From 10aa63f0e112eac79e5fcafc6dc75961c5b76403 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Sat, 7 Oct 2023 14:03:47 +0800 Subject: [PATCH 001/153] support optimized sp --- configs/7B_sft.py | 6 +- internlm/model/linear.py | 219 ++++++++++++++++++++++++- internlm/model/modeling_internlm.py | 22 ++- internlm/model/multi_head_attention.py | 140 +++++++++++++++- train.py | 21 +-- 5 files changed, 378 insertions(+), 30 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 25a98bf8..a23edcec 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -146,10 +146,10 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node. """ parallel = dict( - zero1=8, - tensor=1, + zero1=-1, + tensor=2, pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=False, + sequence_parallel=True, ) cudnn_deterministic = False diff --git a/internlm/model/linear.py b/internlm/model/linear.py index d18308a8..5ee1af9d 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -5,13 +5,32 @@ import torch from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear -from flash_attn.utils.distributed import all_reduce, reduce_scatter +from flash_attn.utils.distributed import all_reduce, reduce_scatter, all_gather_raw, reduce_scatter_raw +from torch import Tensor from torch import nn +from torch.cuda.amp import custom_bwd, custom_fwd from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.model.utils import Silu, fused_dense_func_torch +from typing import Optional +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.distributed import ProcessGroup +from torch.cuda.amp import custom_bwd, custom_fwd + +# import fused_dense_cuda # from apex +import fused_dense_lib as fused_dense_cuda + +from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_fwd, sqrelu_bwd +from flash_attn.utils.distributed import all_gather_raw, reduce_scatter_raw, all_reduce_raw +from flash_attn.utils.distributed import reduce_scatter, all_reduce + class ScaleColumnParallelLinear(nn.Linear): """ @@ -200,3 +219,201 @@ def forward(self, x): w2_o = self.w2(x) out = self.w3(Silu(w1_o, w2_o)) return out + +class FusedDenseFunc_fsdp(torch.autograd.Function): + + @staticmethod + @custom_fwd + def forward(ctx, x, weight, bias, return_residual=False, process_group=None): + + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + total_x = x + + # do all_gather for weight and bias before actual computation + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + if bias is not None: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() + else: + total_bias = bias + + if torch.is_autocast_enabled(): + total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) + total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + handle_weight.wait() + total_weight = total_weight.contiguous() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if 
min(batch_dim, n, *total_weight.shape) > 65535 * 32: + raise RuntimeError('fused_dense only supports matrix dims <= 2M') + output = F.linear(total_x, total_weight, total_bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + grad_input, = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + if ctx.compute_weight_gradient: + x, weight = ctx.saved_tensors + total_x = x + else: + weight, = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + + # do all-gather for weight before backward + weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + # if process_group is not None: + # import pdb; pdb.set_trace() + # grad_input, handle_grad_input = reduce_scatter_raw(grad_input, process_group, async_op=True) + # grad_input, handle_grad_input = all_reduce_raw(grad_input, process_group, async_op=True) + + else: + grad_input = None + # import pdb; pdb.set_trace() + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + handle_grad_bias.wait() + handle_grad_weight.wait() + + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + # if process_group is not None and ctx.needs_input_grad[0]: + # handle_grad_input.wait() + # import pdb; pdb.set_trace() + return grad_input, grad_weight, grad_bias, None, None, None + + +def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, + return_residual: bool = False, process_group = None): + dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16] + or (x.dtype == torch.float32 and torch.is_autocast_enabled())) + if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: + return FusedDenseFunc_fsdp.apply(x, weight, bias, return_residual, process_group) + else: + assert process_group is None + out = F.linear(x, weight, bias) + return out if not return_residual else (out, x) + +class FSDPLinear(ColumnParallelLinear): + + def forward(self, x): + return fsdp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group) + + +class FSDPScaleLinear(ScaleColumnParallelLinear): + + def forward(self, input): # pylint: disable=W0622 + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. 
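        # Scale trick: weight_scale * w + (1 - weight_scale) * w.detach() equals w
        # in value, but only the first term carries gradient, so the weight
        # gradient is scaled by weight_scale without changing the forward output.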
+ if self.weight_scale != 1: + weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() + else: + weight = self.weight + return fsdp_fused_dense_func( + input, + weight, + self.bias, + process_group=self.process_group, + ) + + +class FSDPFeedForward(nn.Module): + """ + FeedForward. + + Args: + in_features (int): size of each input sample + hidden_features (int): size of hidden state of FFN + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + multiple_of (int): For efficient training. Reset the size of hidden feature. 256 by default. + """ + + def __init__( + self, + in_features: int, + hidden_features: int, + out_features: int = None, + process_group: Optional[torch.distributed.ProcessGroup] = None, + bias: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + multiple_of: int = 256, + ): + super().__init__() + + hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) + + self.w1 = FSDPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w2 = FSDPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w3 = FSDPLinear( + hidden_features, + out_features, + process_group, + bias=bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + + def forward(self, x): + w1_o = self.w1(x) + w2_o = self.w2(x) + out = self.w3(Silu(w1_o, w2_o)) + return out diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 2856a782..8ac8c58d 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -17,9 +17,11 @@ FeedForward, RewardModelLinear, ScaleColumnParallelLinear, + FSDPScaleLinear, + FSDPFeedForward, ) from internlm.model.multi_head_attention import MHA -from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm +from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm, split_forward_gather_backward from internlm.solver.pipeline_utils import partition_uniform from internlm.utils.checkpoint import activation_checkpoint from internlm.utils.common import filter_kwargs @@ -107,7 +109,16 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - self.mlp = FeedForward( + # self.mlp = FeedForward( + # hidden_size, + # int(hidden_size * mlp_ratio), + # out_features=hidden_size, + # process_group=gpc.get_group(ParallelMode.TENSOR), + # bias=False, + # device=device, + # dtype=dtype, + # ) + self.mlp = FSDPFeedForward( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, @@ -293,7 +304,8 @@ def __init__( if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear + # head_cls = ScaleColumnParallelLinear + head_cls = FSDPScaleLinear if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -379,6 +391,9 @@ def forward(self, 
hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] + if gpc.config.parallel.sequence_parallel: + indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None for _, block in enumerate(self.blocks): @@ -394,6 +409,7 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): hidden_states = self.head(hidden_states) + hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index e4008e15..abb9f19c 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -18,7 +18,114 @@ from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding -from internlm.model.linear import ColumnParallelLinearTorch, RowParallelLinearTorch +from internlm.model.linear import ColumnParallelLinearTorch, RowParallelLinearTorch, FSDPLinear + +import torch + +from typing import Any, Tuple +from torch import Tensor +from torch.nn import Module + +import torch.distributed as dist + + +class _SeqAllToAll(torch.autograd.Function): + + @staticmethod + def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor, scatter_idx: int, gather_idx: int) -> Tensor: + + ctx.group = group + ctx.scatter_idx = scatter_idx + ctx.gather_idx = gather_idx + + seq_world_size = dist.get_world_size(group) + + input_list = [t.contiguous() for t in torch.tensor_split(input, seq_world_size, scatter_idx)] + output_list = [torch.empty_like(input_list[0]) for _ in range(seq_world_size)] + # TODO Use all_to_all_single instead + dist.all_to_all(output_list, input_list, group=group) + return torch.cat(output_list, dim=gather_idx).contiguous() + + @staticmethod + def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]: + return (None, _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx), None, None) + + +class DistributedAttention(torch.nn.Module): + """Initialization. 
+ + Arguments: + local_attention (Module): local attention with q,k,v + sequence_process_group (ProcessGroup): sequence parallel process group + scatter_idx (int): scatter_idx for all2all comm + gather_idx (int): gather_idx for all2all comm + """ + + def __init__( + self, + local_attention: Module, + sequence_process_group: dist.ProcessGroup, + scatter_idx: int = 2, + gather_idx: int = 0, + ) -> None: + + super(DistributedAttention, self).__init__() + self.local_attn = local_attention + self.spg = sequence_process_group + self.scatter_idx = scatter_idx + self.gather_idx = gather_idx + + # def forward(self, query: Tensor, key: Tensor, value: Tensor, *args: Any) -> Tensor: + # """ forward + + # Arguments: + # query (Tensor): query input to the layer + # key (Tensor): key input to the layer + # value (Tensor): value input to the layer + # args: other args + + # Returns: + # * output (Tensor): context output + # """ + # # TODO Merge three alltoall calls into one + # #in shape : e.g., [s/p:h:] + # query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx) + # key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx) + # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) + + # #out shape : e.g., [s:h/p:] + # context_layer = self.local_attn(query_layer, key_layer, value_layer, *args) + + # output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx) + + # #out e.g., [s/p::h] + # return output + + def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: + """ forward + + Arguments: + query (Tensor): query input to the layer + key (Tensor): key input to the layer + value (Tensor): value input to the layer + args: other args + + Returns: + * output (Tensor): context output + """ + # TODO Merge three alltoall calls into one + #in shape : e.g., [s/p:h:] + qkv = _SeqAllToAll.apply(self.spg, qkv, self.scatter_idx, self.gather_idx) + # key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx) + # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) + + #out shape : e.g., [s:h/p:] + context_layer = self.local_attn(qkv, **kwargs) + + output = _SeqAllToAll.apply(self.spg, context_layer, 0, 2) + + #out e.g., [s/p::h] + return output class MHA(nn.Module): @@ -91,7 +198,16 @@ def __init__( self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device) # notice here should change bias=True - self.Wqkv = ColumnParallelLinearTorch( + # self.Wqkv = ColumnParallelLinearTorch( + # embed_dim, + # 3 * embed_dim, + # process_group, + # bias=True, + # sequence_parallel=gpc.config.parallel.sequence_parallel, + # **factory_kwargs, + # ) # according to https://spaces.ac.cn/archives/9577 + + self.Wqkv = FSDPLinear( embed_dim, 3 * embed_dim, process_group, @@ -106,9 +222,19 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) + + self.inner_attn_sp = DistributedAttention(self.inner_attn, sequence_process_group=process_group, scatter_idx=3, gather_idx=0) + self.inner_cross_attn_sp = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group, scatter_idx=3, gather_idx=0) # output projection always have the bias (for now) - self.out_proj = RowParallelLinearTorch( + # self.out_proj = RowParallelLinearTorch( + # embed_dim, + # embed_dim, + # process_group, + # sequence_parallel=gpc.config.parallel.sequence_parallel, + # 
**factory_kwargs, + # ) + self.out_proj = FSDPLinear( embed_dim, embed_dim, process_group, @@ -211,15 +337,17 @@ def _packed_forward(self, x, inference_params=None, **kwargs): qkv = rearrange(qkv, "t (three h d) -> t three h d", three=3, d=self.head_dim) # total x 3 x n_head x d qkv = self.rotary_emb(qkv, **kwargs) kwargs.pop("indexes") - + if inference_params is None: if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn: with torch.cuda.amp.autocast(dtype=torch.bfloat16): if qkv.dtype not in [torch.float16, torch.bfloat16]: qkv = qkv.to(torch.bfloat16) - context = self.inner_attn(qkv, **kwargs).to(x.dtype) + # context = self.inner_attn(qkv, **kwargs).to(x.dtype) + context = self.inner_attn_sp(qkv, **kwargs).to(x.dtype) else: - context = self.inner_attn(qkv, **kwargs) + # context = self.inner_attn(qkv, **kwargs) + context = self.inner_attn_sp(qkv, **kwargs) else: raise RuntimeError("Not support this right now") diff --git a/train.py b/train.py index 139bac1f..9bc4bd7f 100644 --- a/train.py +++ b/train.py @@ -110,7 +110,6 @@ def main(args): # initialize and resume train state train_state = TrainState(gpc.config, train_dl.batch_sampler) - optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) ckpt_manager = CheckpointManager( @@ -170,6 +169,7 @@ def main(args): beta2_scheduler=beta2_scheduler, scheduler_hooks=scheduler_hooks, ) + # initialize simple memory profiler if args.profiling: @@ -219,21 +219,9 @@ def main(args): # do forward and backward timer("fwd-bwd").start() - moe_loss = None - if hasattr(gpc.config.model, "num_experts"): - _, _, loss, moe_loss = trainer.execute_schedule( - batch, - forward_only=False, - return_loss=True, - return_output_label=False, - ) - else: - _, _, loss = trainer.execute_schedule( - batch, - forward_only=False, - return_loss=True, - return_output_label=False, - ) + _, _, loss = trainer.execute_schedule( + batch, forward_only=False, return_loss=True, return_output_label=False + ) timer("fwd-bwd").stop() # update parameters, and returns (success_update, grad_norm) @@ -266,7 +254,6 @@ def main(args): trainer=trainer, start_time=start_time, loss=loss, - moe_loss=moe_loss, grad_norm=grad_norm_groups, metric=metric, update_panel=uniscale_logger is not None, From bf475b694014b77159240feabba310a704cdbfdd Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Sun, 8 Oct 2023 13:20:29 +0800 Subject: [PATCH 002/153] debug --- configs/7B_sft.py | 4 ++-- .../core/scheduler/no_pipeline_scheduler.py | 4 ++-- internlm/model/linear.py | 19 +++++++++++++++---- internlm/model/modeling_internlm.py | 1 + train.py | 1 + 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 20119343..51d2e9c4 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 32 +NUM_LAYER = 4 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -55,7 +55,7 @@ # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate - valid_every=50, + valid_every=1000, pack_sample_into_one=False, total_steps=50000, skip_batches="", diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 56661d8c..97687904 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -202,10 +202,10 @@ def forward_backward_step( if return_output_label: 
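                # collect per-microbatch outputs and labels so metrics can be
                # computed over the full global batch afterwards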
outputs.append(_output) labels.append(_label) - + if not return_output_label: outputs, labels = None, None - + # Compatible for non-moe if hasattr(gpc.config.model, "num_experts"): return outputs, labels, loss, moe_loss diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 5ee1af9d..5ea0e80b 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -28,9 +28,20 @@ import fused_dense_lib as fused_dense_cuda from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_fwd, sqrelu_bwd -from flash_attn.utils.distributed import all_gather_raw, reduce_scatter_raw, all_reduce_raw +from flash_attn.utils.distributed import all_gather_raw, all_reduce_raw +# reduce_scatter_raw from flash_attn.utils.distributed import reduce_scatter, all_reduce +def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, op=torch.distributed.ReduceOp.SUM): + world_size = torch.distributed.get_world_size(process_group) + assert input_.shape[0] % world_size == 0 + output = torch.empty( + input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device + ) + handle = torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), op=op, group=process_group, async_op=async_op + ) + return output, handle class ScaleColumnParallelLinear(nn.Linear): """ @@ -279,15 +290,15 @@ def backward(ctx, grad_output, *args): grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) # do all-gather for weight before backward - weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() if ctx.needs_input_grad[0]: if not ctx.return_residual: - grad_input = F.linear(grad_output, weight.t()) + grad_input = F.linear(grad_output, total_weight.t()) else: grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, weight) + grad_output, total_weight) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) # if process_group is not None: # import pdb; pdb.set_trace() diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 8ac8c58d..0db99ad0 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -372,6 +372,7 @@ def __init__( def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None): # attention_mask: compute attention on the places where the value is 1 + import pdb; pdb.set_trace() if hasattr(self, "embedding"): hidden_states = self.embedding(input_ids) if self.embed_grad_scale != 1: diff --git a/train.py b/train.py index 9bc4bd7f..1adcc22a 100644 --- a/train.py +++ b/train.py @@ -254,6 +254,7 @@ def main(args): trainer=trainer, start_time=start_time, loss=loss, + moe_loss=None, grad_norm=grad_norm_groups, metric=metric, update_panel=uniscale_logger is not None, From bd4af3a31f595ed6e587e5dccefca14535d9b8dd Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Sun, 8 Oct 2023 17:21:17 +0800 Subject: [PATCH 003/153] modify the all2all --- configs/7B_sft.py | 2 +- internlm/model/modeling_internlm.py | 1 - internlm/model/multi_head_attention.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 51d2e9c4..5e3e0c93 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 4 +NUM_LAYER = 
32 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 0db99ad0..8ac8c58d 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -372,7 +372,6 @@ def __init__( def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None): # attention_mask: compute attention on the places where the value is 1 - import pdb; pdb.set_trace() if hasattr(self, "embedding"): hidden_states = self.embedding(input_ids) if self.embed_grad_scale != 1: diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index abb9f19c..e6d0a297 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -115,14 +115,14 @@ def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: """ # TODO Merge three alltoall calls into one #in shape : e.g., [s/p:h:] - qkv = _SeqAllToAll.apply(self.spg, qkv, self.scatter_idx, self.gather_idx) + qkv = _SeqAllToAll.apply(self.spg, qkv, 2, 0) # key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx) # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) #out shape : e.g., [s:h/p:] context_layer = self.local_attn(qkv, **kwargs) - output = _SeqAllToAll.apply(self.spg, context_layer, 0, 2) + output = _SeqAllToAll.apply(self.spg, context_layer, 0, 1) #out e.g., [s/p::h] return output From 189a313da6a6b6710f07f7e5e13cacb56eeb7256 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 17:26:20 +0800 Subject: [PATCH 004/153] support fstp and refactor code --- configs/7B_sft.py | 10 +-- internlm/core/context/parallel_context.py | 3 +- internlm/initialize/launch.py | 6 ++ internlm/model/linear.py | 91 +++++++------------ internlm/model/modeling_internlm.py | 29 +++--- internlm/model/multi_head_attention.py | 104 ++++++++-------------- internlm/utils/evaluation.py | 5 +- 7 files changed, 104 insertions(+), 144 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 5e3e0c93..6758167a 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 32 +NUM_LAYER = 4 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -55,7 +55,7 @@ # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate - valid_every=1000, + valid_every=10, pack_sample_into_one=False, total_steps=50000, skip_batches="", @@ -64,7 +64,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + empty_cache_and_diag_interval=100, diag_outlier_ratio=1.1, ) @@ -135,7 +135,7 @@ num_layers=NUM_LAYER, mlp_ratio=MLP_RATIO, apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" norm_type="rmsnorm", layer_norm_epsilon=1e-5, use_flash_attn=True, @@ -155,7 +155,7 @@ """ parallel = dict( zero1=-1, - tensor=2, + tensor=dict(size=2, mode='fstp'), # the mode should be 'origin_tp' or 'fstp' pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 7f3e415a..da6a0d7e 100644 --- 
a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -568,7 +568,8 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): # during model construction), this is because the random state will be different in different tensor parallel # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform # additional random operations during the RowParallelLinear module building process. - set_mode(ParallelMode.DUMMY) + # set_mode(ParallelMode.DUMMY) + set_mode(ParallelMode.TENSOR) seeds = get_seeds() seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()]) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 660cc559..895779e3 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -279,6 +279,12 @@ def args_sanity_check(): assert not ( gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False ), "sequence parallel does not support use_flash_attn=False" + + if gpc.config.parallel["tensor"].get("mode", None) is None: + gpc.config.parallel["tensor"]["mode"] = "origin_tp" + + if gpc.config.parallel["tensor"].get("mode", None) is 'fstp': + assert gpc.config.parallel.sequence_parallel is True, "when the tp_mode is fstp, the sequence_parallel should be True." # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy if hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1: diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 5ea0e80b..60a3d272 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -4,44 +4,20 @@ from typing import Optional import torch +import torch.nn.functional as F from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear from flash_attn.utils.distributed import all_reduce, reduce_scatter, all_gather_raw, reduce_scatter_raw from torch import Tensor from torch import nn from torch.cuda.amp import custom_bwd, custom_fwd +# import fused_dense_cuda # from apex +import fused_dense_lib as fused_dense_cuda + from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.model.utils import Silu, fused_dense_func_torch -from typing import Optional -from functools import partial - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor -from torch.distributed import ProcessGroup -from torch.cuda.amp import custom_bwd, custom_fwd - -# import fused_dense_cuda # from apex -import fused_dense_lib as fused_dense_cuda - -from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_fwd, sqrelu_bwd -from flash_attn.utils.distributed import all_gather_raw, all_reduce_raw -# reduce_scatter_raw -from flash_attn.utils.distributed import reduce_scatter, all_reduce - -def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, op=torch.distributed.ReduceOp.SUM): - world_size = torch.distributed.get_world_size(process_group) - assert input_.shape[0] % world_size == 0 - output = torch.empty( - input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device - ) - handle = torch.distributed.reduce_scatter_tensor( - output, input_.contiguous(), op=op, group=process_group, async_op=async_op - ) - return output, handle class ScaleColumnParallelLinear(nn.Linear): """ @@ -231,7 +207,7 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out -class 
FusedDenseFunc_fsdp(torch.autograd.Function): +class FSDPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd @@ -243,21 +219,26 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) - x = x.contiguous() - total_x = x + total_x = x.contiguous() - # do all_gather for weight and bias before actual computation - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - if bias is not None: - total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) - handle_bias.wait() + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + # do all_gather for weight and bias before actual computation + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + if bias is not None: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() + else: + total_bias = bias + handle_weight.wait() else: + total_weight = weight total_bias = bias if torch.is_autocast_enabled(): total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None - handle_weight.wait() + total_weight = total_weight.contiguous() batch_shape, n = total_x.shape[:-1], total_x.shape[-1] batch_dim = batch_shape.numel() @@ -289,9 +270,13 @@ def backward(ctx, grad_output, *args): batch_dim = batch_shape.numel() grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - # do all-gather for weight before backward - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + # do all-gather for weight before backward + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + else: + total_weight = weight if ctx.needs_input_grad[0]: if not ctx.return_residual: @@ -300,32 +285,24 @@ def backward(ctx, grad_output, *args): grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - # if process_group is not None: - # import pdb; pdb.set_trace() - # grad_input, handle_grad_input = reduce_scatter_raw(grad_input, process_group, async_op=True) - # grad_input, handle_grad_input = all_reduce_raw(grad_input, process_group, async_op=True) - else: grad_input = None - # import pdb; pdb.set_trace() + if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - handle_grad_bias.wait() - handle_grad_weight.wait() - + if world_size > 1: + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + handle_grad_bias.wait() + handle_grad_weight.wait() else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None - # if process_group is not None and ctx.needs_input_grad[0]: - # 
handle_grad_input.wait() - # import pdb; pdb.set_trace() return grad_input, grad_weight, grad_bias, None, None, None @@ -334,7 +311,7 @@ def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = No dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16] or (x.dtype == torch.float32 and torch.is_autocast_enabled())) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FusedDenseFunc_fsdp.apply(x, weight, bias, return_residual, process_group) + return FSDPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) else: assert process_group is None out = F.linear(x, weight, bias) @@ -426,5 +403,5 @@ def __init__( def forward(self, x): w1_o = self.w1(x) w2_o = self.w2(x) - out = self.w3(Silu(w1_o, w2_o)) + out = self.w3(F.silu(w1_o) * w2_o) return out diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 8ac8c58d..47d706f6 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -74,6 +74,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, + tp_mode: str = 'origin_tp', ): super().__init__() self.checkpoint = checkpoint @@ -98,6 +99,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, + tp_mode=tp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -109,16 +111,8 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - # self.mlp = FeedForward( - # hidden_size, - # int(hidden_size * mlp_ratio), - # out_features=hidden_size, - # process_group=gpc.get_group(ParallelMode.TENSOR), - # bias=False, - # device=device, - # dtype=dtype, - # ) - self.mlp = FSDPFeedForward( + mlp_cls = FeedForward if tp_mode == 'origin_tp' else FSDPFeedForward + self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, @@ -179,6 +173,7 @@ def reset_parameters(self): else: normal_(std=0.006 if "fc1" in name else 0.0015)(param.data) + def forward(self, hidden_states, cu_seqlens=None, indexes=None, inference_params=None, max_seqlen=None): if self.checkpoint and self.training: return activation_checkpoint( @@ -300,12 +295,12 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) + self.tp_mode = gpc.config.parallel["tensor"]["mode"] if is_reward: head_cls = RewardModelLinear else: - # head_cls = ScaleColumnParallelLinear - head_cls = FSDPScaleLinear + head_cls = ScaleColumnParallelLinear if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -346,6 +341,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, + tp_mode = self.tp_mode, ) for lid in range(num_layers) ] @@ -391,7 +387,8 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] - if gpc.config.parallel.sequence_parallel: + # if the tensor parallel mode is 'fstp', the indexes should also be split in sequence dimension. 
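            # e.g. with tp size 2 and a packed sequence of 8 tokens, rank 0 keeps
            # indexes[:4] and rank 1 keeps indexes[4:], mirroring the sequence
            # split the fstp linears apply to hidden_states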
+ if gpc.config.parallel.sequence_parallel and self.tp_mode == 'fstp': indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None @@ -408,8 +405,12 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): + # if hidden_states.ndim == 3: + # import pdb; pdb.set_trace() + # hidden_states = self.head(hidden_states, dim=1) + # else: + # hidden_states = self.head(hidden_states) hidden_states = self.head(hidden_states) - hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index e6d0a297..8f7a064d 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -57,49 +57,29 @@ class DistributedAttention(torch.nn.Module): Arguments: local_attention (Module): local attention with q,k,v sequence_process_group (ProcessGroup): sequence parallel process group - scatter_idx (int): scatter_idx for all2all comm - gather_idx (int): gather_idx for all2all comm + first_scatter_idx (int): scatter_idx for the first all2all comm + first_gather_idx (int): gather_idx for the first all2all comm + second_scatter_idx (int): scatter_idx for the second all2all comm + second_gather_idx (int): gather_idx for the second all2all comm """ def __init__( self, local_attention: Module, sequence_process_group: dist.ProcessGroup, - scatter_idx: int = 2, - gather_idx: int = 0, + first_scatter_idx: int = 2, + first_gather_idx: int = 0, + second_scatter_idx: int = 0, + second_gather_idx: int = 1, ) -> None: super(DistributedAttention, self).__init__() self.local_attn = local_attention self.spg = sequence_process_group - self.scatter_idx = scatter_idx - self.gather_idx = gather_idx - - # def forward(self, query: Tensor, key: Tensor, value: Tensor, *args: Any) -> Tensor: - # """ forward - - # Arguments: - # query (Tensor): query input to the layer - # key (Tensor): key input to the layer - # value (Tensor): value input to the layer - # args: other args - - # Returns: - # * output (Tensor): context output - # """ - # # TODO Merge three alltoall calls into one - # #in shape : e.g., [s/p:h:] - # query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx) - # key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx) - # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) - - # #out shape : e.g., [s:h/p:] - # context_layer = self.local_attn(query_layer, key_layer, value_layer, *args) - - # output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx) - - # #out e.g., [s/p::h] - # return output + self.first_scatter_idx = first_scatter_idx + self.first_gather_idx = first_gather_idx + self.second_scatter_idx = second_scatter_idx + self.second_gather_idx = second_gather_idx def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: """ forward @@ -114,15 +94,21 @@ def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: * output (Tensor): context output """ # TODO Merge three alltoall calls into one - #in shape : e.g., [s/p:h:] - qkv = _SeqAllToAll.apply(self.spg, qkv, 2, 0) - # key_layer = _SeqAllToAll.apply(self.spg, 
key, self.scatter_idx, self.gather_idx) - # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) - - #out shape : e.g., [s:h/p:] - context_layer = self.local_attn(qkv, **kwargs) - - output = _SeqAllToAll.apply(self.spg, context_layer, 0, 1) + if qkv.ndim == 5: + # in shape: [seq/tp_size, 3, head, head_dim] + qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx + 1, self.first_gather_idx + 1) + #out shape : [seq, head/tp_size, head_dim] + context_layer = self.local_attn(qkv, **kwargs) + # in shape: [seq, head/tp_size, head_dim] + output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx + 1, self.second_gather_idx + 1) + else: + + # in shape: [seq/tp_size, 3, head, head_dim] + qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx, self.first_gather_idx) + #out shape : [seq, head/tp_size, head_dim] + context_layer = self.local_attn(qkv, **kwargs) + # in shape: [seq, head/tp_size, head_dim] + output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx, self.second_gather_idx) #out e.g., [s/p::h] return output @@ -171,6 +157,7 @@ def __init__( use_flash_attn: bool = True, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, + tp_mode: str = 'origin_tp', ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -198,16 +185,8 @@ def __init__( self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device) # notice here should change bias=True - # self.Wqkv = ColumnParallelLinearTorch( - # embed_dim, - # 3 * embed_dim, - # process_group, - # bias=True, - # sequence_parallel=gpc.config.parallel.sequence_parallel, - # **factory_kwargs, - # ) # according to https://spaces.ac.cn/archives/9577 - - self.Wqkv = FSDPLinear( + Wqkv_cls = ColumnParallelLinearTorch if tp_mode == 'origin_tp' else FSDPLinear + self.Wqkv = Wqkv_cls( embed_dim, 3 * embed_dim, process_group, @@ -222,25 +201,20 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) - - self.inner_attn_sp = DistributedAttention(self.inner_attn, sequence_process_group=process_group, scatter_idx=3, gather_idx=0) - self.inner_cross_attn_sp = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group, scatter_idx=3, gather_idx=0) + if tp_mode == 'fstp': + self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group) + self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group) # output projection always have the bias (for now) - # self.out_proj = RowParallelLinearTorch( - # embed_dim, - # embed_dim, - # process_group, - # sequence_parallel=gpc.config.parallel.sequence_parallel, - # **factory_kwargs, - # ) - self.out_proj = FSDPLinear( + out_proj_cls = RowParallelLinearTorch if tp_mode == 'origin_tp' else FSDPLinear + self.out_proj = out_proj_cls( embed_dim, embed_dim, process_group, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) + # need to assign tp attribute so that internlm know it is tensor parallel module if gpc.get_world_size(ParallelMode.TENSOR) > 1: for name in ["out_proj", "Wqkv"]: @@ -343,11 +317,9 @@ def _packed_forward(self, x, inference_params=None, **kwargs): with torch.cuda.amp.autocast(dtype=torch.bfloat16): if qkv.dtype not in [torch.float16, torch.bfloat16]: qkv = qkv.to(torch.bfloat16) - # context = self.inner_attn(qkv, **kwargs).to(x.dtype) - context 
= self.inner_attn_sp(qkv, **kwargs).to(x.dtype) + context = self.inner_attn(qkv, **kwargs).to(x.dtype) else: - # context = self.inner_attn(qkv, **kwargs) - context = self.inner_attn_sp(qkv, **kwargs) + context = self.inner_attn(qkv, **kwargs) else: raise RuntimeError("Not support this right now") diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 6a55fa56..2a11a478 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -54,7 +54,10 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - gpc.config.parallel.sequence_parallel = False + if gpc.config.parallel["tensor"]["mode"] == 'fstp': + gpc.config.parallel.sequence_parallel = True + else: + gpc.config.parallel.sequence_parallel = False yield finally: gpc.config.parallel.sequence_parallel = prev_mode From 21c1a7fa47bc49eca26dc63d33a1f57d855e15dd Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 18:01:06 +0800 Subject: [PATCH 005/153] support evaluation with fstp --- configs/7B_sft.py | 4 +- internlm/model/linear.py | 7 +- internlm/model/modeling_internlm.py | 10 +-- internlm/model/utils.py | 121 ++++++++++++++++++++++++++-- 4 files changed, 124 insertions(+), 18 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 6758167a..3e1d0780 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 4 +NUM_LAYER = 32 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -155,7 +155,7 @@ """ parallel = dict( zero1=-1, - tensor=dict(size=2, mode='fstp'), # the mode should be 'origin_tp' or 'fstp' + tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp' pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 60a3d272..fbe6f141 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -54,7 +54,7 @@ def __init__( self.process_group = process_group self.weight_scale = weight_scale - def forward(self, input): # pylint: disable=W0622 + def forward(self, input, gather_dim=0): # pylint: disable=W0622 # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. @@ -68,6 +68,7 @@ def forward(self, input): # pylint: disable=W0622 self.bias, process_group=self.process_group, sequence_parallel=gpc.config.parallel.sequence_parallel, + gather_dim=gather_dim, ) @@ -121,13 +122,13 @@ def forward(self, input): # pylint: disable=W0622 class ColumnParallelLinearTorch(ColumnParallelLinear): - def forward(self, x): + def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. 
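        # gather_dim picks the all-gather axis for the input: dim 0 for packed
        # (total_tokens, hidden) training inputs, dim 1 for (batch, seq, hidden)
        # inputs as passed by the head during evaluation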
return fused_dense_func_torch( - x, self.weight, self.bias, process_group=self.process_group, sequence_parallel=self.sequence_parallel + x, self.weight, self.bias, process_group=self.process_group, sequence_parallel=self.sequence_parallel, gather_dim=gather_dim, ) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 47d706f6..56a8efac 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -405,12 +405,10 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): - # if hidden_states.ndim == 3: - # import pdb; pdb.set_trace() - # hidden_states = self.head(hidden_states, dim=1) - # else: - # hidden_states = self.head(hidden_states) - hidden_states = self.head(hidden_states) + if hidden_states.ndim == 3: + hidden_states = self.head(hidden_states, gather_dim=1) + else: + hidden_states = self.head(hidden_states) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 570a86f0..33c8c46e 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -5,16 +5,18 @@ import torch import torch.nn.functional as F -from flash_attn.ops.fused_dense import FusedDenseFunc +# from flash_attn.ops.fused_dense import FusedDenseFunc from flash_attn.utils.distributed import ( - all_gather_raw, + # all_gather_raw, all_reduce_raw, reduce_scatter_raw, ) from torch import Tensor -from torch.cuda.amp import custom_bwd +from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup +import fused_dense_lib as fused_dense_cuda + from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger @@ -94,6 +96,109 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): grad_bias = grad_output.sum(dim=0) if has_d_bias else None return grad_weight, grad_bias +def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0): + world_size = torch.distributed.get_world_size(process_group) + shape = list(input_.shape) + shape[gather_dim] = shape[gather_dim] * world_size + # output = torch.empty(world_size * input_.shape[0], *input_.shape[1:], + # dtype=input_.dtype, device=input_.device) + output = torch.empty(shape, dtype=input_.dtype, device=input_.device) + handle = torch.distributed.all_gather_into_tensor(output, input_.contiguous(), + group=process_group, async_op=async_op) + return output, handle + +class FusedDenseFunc(torch.autograd.Function): + + @staticmethod + @custom_fwd + def forward(ctx, x, weight, bias, return_residual=False, process_group=None, + sequence_parallel=True, gather_dim=0): + """ + If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel + with sequence parallelism: we do an all_gather_raw of x before doing the matmul. 
+ """ + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + ctx.sequence_parallel = sequence_parallel + ctx.gather_dim = gather_dim + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + if process_group is not None and sequence_parallel: + # We want to kick off the all_gather early, before weight dtype conversion + total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) + else: + total_x = x + + if torch.is_autocast_enabled(): + weight = weight.to(dtype=torch.get_autocast_gpu_dtype()) + bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + weight = weight.contiguous() + if process_group is not None and sequence_parallel: + handle_x.wait() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *weight.shape) > 65535 * 32: + raise RuntimeError('fused_dense only supports matrix dims <= 2M') + output = F.linear(total_x, weight, bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + grad_input, = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + sequence_parallel = ctx.sequence_parallel + gather_dim = ctx.gather_dim + + if ctx.compute_weight_gradient: + x, weight = ctx.saved_tensors + if process_group is not None and sequence_parallel: + total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) + else: + total_x = x + else: + weight, = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + if process_group is not None: + reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw + grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) + else: + grad_input = None + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + if process_group is not None and sequence_parallel: + handle_x.wait() + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + if process_group is not None and ctx.needs_input_grad[0]: + handle_grad_input.wait() + return grad_input, grad_weight, grad_bias, None, None, None, None + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFuncTorch(FusedDenseFunc): @@ -108,10 +213,11 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.contiguous() process_group = ctx.process_group sequence_parallel = ctx.sequence_parallel + gather_dim = ctx.gather_dim if 
ctx.compute_weight_gradient: x, weight = ctx.saved_tensors if process_group is not None and sequence_parallel: - total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) else: total_x = x else: @@ -144,7 +250,7 @@ def backward(ctx, grad_output, *args): grad_bias = grad_output if ctx.needs_input_grad[2] else None if process_group is not None and ctx.needs_input_grad[0]: handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None def fused_dense_func_torch( @@ -154,14 +260,15 @@ def fused_dense_func_torch( return_residual: bool = False, process_group: Optional[ProcessGroup] = None, sequence_parallel: bool = True, + gather_dim: int = 0, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel) + return FusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) else: - return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel) + return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) class _SplitForwardGatherBackward(torch.autograd.Function): From 949431f228cdf0dbfdcd0909b905cb6075517eb6 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 18:06:22 +0800 Subject: [PATCH 006/153] modify the config --- configs/7B_sft.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 3e1d0780..dd4104ab 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -55,7 +55,7 @@ # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate - valid_every=10, + valid_every=50, pack_sample_into_one=False, total_steps=50000, skip_batches="", @@ -64,7 +64,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, + empty_cache_and_diag_interval=10, diag_outlier_ratio=1.1, ) @@ -135,7 +135,7 @@ num_layers=NUM_LAYER, mlp_ratio=MLP_RATIO, apply_post_layer_norm=False, - dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" norm_type="rmsnorm", layer_norm_epsilon=1e-5, use_flash_attn=True, @@ -155,9 +155,9 @@ """ parallel = dict( zero1=-1, - tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp' + tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, + sequence_parallel=False, ) cudnn_deterministic = False From 54e561665eb65f0212686051c943f73fd98c716f Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 18:08:15 +0800 Subject: [PATCH 007/153] remove useless code for no-pp --- internlm/core/scheduler/no_pipeline_scheduler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 97687904..6777acc5 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -202,10 +202,8 @@ def forward_backward_step( if return_output_label: outputs.append(_output) labels.append(_label) - if not return_output_label: outputs, labels = None, None - # Compatible for non-moe if hasattr(gpc.config.model, "num_experts"): return outputs, labels, loss, moe_loss From 144731c35c47171ab675e5fc9557468450a5a666 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:04:27 +0800 Subject: [PATCH 008/153] fix evaluation bug in pp --- internlm/initialize/launch.py | 2 +- internlm/utils/evaluation.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 3651a4c7..5bd2b73c 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -283,7 +283,7 @@ def args_sanity_check(): if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "origin_tp" - if gpc.config.parallel["tensor"].get("mode", None) is 'fstp': + if gpc.config.parallel["tensor"].get("mode", None) == 'fstp': assert gpc.config.parallel.sequence_parallel is True, "when the tp_mode is fstp, the sequence_parallel should be True." # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 2a11a478..148d19df 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -106,9 +106,15 @@ def evaluate_on_val_dls( total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz - tensor_shape = torch.Size( - [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1], gpc.config.HIDDEN_SIZE] - ) + if gpc.config.parallel['tensor']['mode'] == 'fstp': + sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR) + tensor_shape = torch.Size( + [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1] // sequence_world_size, gpc.config.HIDDEN_SIZE] + ) + else: + tensor_shape = torch.Size( + [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1], gpc.config.HIDDEN_SIZE] + ) with switch_evaluation_pipeline_scheduler( trainer=trainer, From ef9e7cc6221823a610e3a9b0c369745d7f1e1f71 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:05:39 +0800 Subject: [PATCH 009/153] modify the config --- configs/7B_sft.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index dd4104ab..4c55feea 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -154,8 +154,8 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node. """ parallel = dict( - zero1=-1, - tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True + zero1=8, + tensor=dict(size=1, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=False, ) From 5d39c332fe01d08736cc42ff5613cf887d9e34b6 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:08:49 +0800 Subject: [PATCH 010/153] restore train.py --- train.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/train.py b/train.py index 1adcc22a..139bac1f 100644 --- a/train.py +++ b/train.py @@ -110,6 +110,7 @@ def main(args): # initialize and resume train state train_state = TrainState(gpc.config, train_dl.batch_sampler) + optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) ckpt_manager = CheckpointManager( @@ -169,7 +170,6 @@ def main(args): beta2_scheduler=beta2_scheduler, scheduler_hooks=scheduler_hooks, ) - # initialize simple memory profiler if args.profiling: @@ -219,9 +219,21 @@ def main(args): # do forward and backward timer("fwd-bwd").start() - _, _, loss = trainer.execute_schedule( - batch, forward_only=False, return_loss=True, return_output_label=False - ) + moe_loss = None + if hasattr(gpc.config.model, "num_experts"): + _, _, loss, moe_loss = trainer.execute_schedule( + batch, + forward_only=False, + return_loss=True, + return_output_label=False, + ) + else: + _, _, loss = trainer.execute_schedule( + batch, + forward_only=False, + return_loss=True, + return_output_label=False, + ) timer("fwd-bwd").stop() # update parameters, and returns (success_update, grad_norm) @@ -254,7 +266,7 @@ def main(args): trainer=trainer, start_time=start_time, loss=loss, - moe_loss=None, + moe_loss=moe_loss, grad_norm=grad_norm_groups, metric=metric, update_panel=uniscale_logger is not None, From 29df765f65fe9797b6168008efd4dc3bf7b8cfd6 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:23:32 +0800 Subject: [PATCH 011/153] refactor code --- .../core/scheduler/no_pipeline_scheduler.py | 2 + internlm/model/linear.py | 118 +---------- internlm/model/utils.py | 183 ++++++++++++++---- 3 files changed, 151 insertions(+), 152 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 6777acc5..56661d8c 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -202,8 +202,10 @@ def forward_backward_step( if return_output_label: outputs.append(_output) labels.append(_label) + if not return_output_label: outputs, labels = None, None + # Compatible for non-moe if hasattr(gpc.config.model, "num_experts"): return outputs, labels, loss, moe_loss diff --git a/internlm/model/linear.py b/internlm/model/linear.py index fbe6f141..4075e9ee 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -6,17 +6,13 @@ import torch import torch.nn.functional as F from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear -from flash_attn.utils.distributed import all_reduce, reduce_scatter, all_gather_raw, reduce_scatter_raw -from torch import Tensor +from flash_attn.utils.distributed import all_reduce, reduce_scatter from torch import nn -from torch.cuda.amp import custom_bwd, custom_fwd -# import fused_dense_cuda # from apex -import fused_dense_lib as fused_dense_cuda from internlm.core.context import ParallelMode from 
internlm.core.context import global_context as gpc -from internlm.model.utils import Silu, fused_dense_func_torch +from internlm.model.utils import Silu, fused_dense_func_torch, fsdp_fused_dense_func class ScaleColumnParallelLinear(nn.Linear): @@ -208,116 +204,6 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out -class FSDPFusedDenseFunc(torch.autograd.Function): - - @staticmethod - @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None): - - ctx.compute_weight_gradient = weight.requires_grad - ctx.return_residual = return_residual - ctx.process_group = process_group - - if torch.is_autocast_enabled(): - x = x.to(dtype=torch.get_autocast_gpu_dtype()) - total_x = x.contiguous() - - world_size = gpc.get_world_size(ParallelMode.TENSOR) - if world_size > 1: - # do all_gather for weight and bias before actual computation - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - if bias is not None: - total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) - handle_bias.wait() - else: - total_bias = bias - handle_weight.wait() - else: - total_weight = weight - total_bias = bias - - if torch.is_autocast_enabled(): - total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) - total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None - - total_weight = total_weight.contiguous() - batch_shape, n = total_x.shape[:-1], total_x.shape[-1] - batch_dim = batch_shape.numel() - # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 - if min(batch_dim, n, *total_weight.shape) > 65535 * 32: - raise RuntimeError('fused_dense only supports matrix dims <= 2M') - output = F.linear(total_x, total_weight, total_bias) - if ctx.compute_weight_gradient: - ctx.save_for_backward(x, weight) - else: - ctx.save_for_backward(weight) - return output if not return_residual else (output, x) - - @staticmethod - @custom_bwd - def backward(ctx, grad_output, *args): - grad_output = grad_output.contiguous() - if ctx.return_residual: - grad_input, = args - grad_input = grad_input.contiguous() - process_group = ctx.process_group - if ctx.compute_weight_gradient: - x, weight = ctx.saved_tensors - total_x = x - else: - weight, = ctx.saved_tensors - total_x = None - batch_shape = grad_output.shape[:-1] - batch_dim = batch_shape.numel() - grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - - world_size = gpc.get_world_size(ParallelMode.TENSOR) - if world_size > 1: - # do all-gather for weight before backward - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() - else: - total_weight = weight - - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, total_weight.t()) - else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, total_weight) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - else: - grad_input = None - - if ctx.needs_input_grad[1]: - assert ctx.compute_weight_gradient - - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] - ) - if world_size > 1: - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, 
process_group, async_op=True)
-                    handle_grad_bias.wait()
-                handle_grad_weight.wait()
-            else:
-                grad_weight = None
-                grad_bias = grad_output if ctx.needs_input_grad[2] else None
-        return grad_input, grad_weight, grad_bias, None, None, None
-
-
-def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None,
-                     return_residual: bool = False, process_group = None):
-    dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16]
-                      or (x.dtype == torch.float32 and torch.is_autocast_enabled()))
-    if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible:
-        return FSDPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group)
-    else:
-        assert process_group is None
-        out = F.linear(x, weight, bias)
-        return out if not return_residual else (out, x)
-
 class FSDPLinear(ColumnParallelLinear):
 
     def forward(self, x):
diff --git a/internlm/model/utils.py b/internlm/model/utils.py
index 33c8c46e..c8845440 100644
--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@@ -5,9 +5,7 @@
 
 import torch
 import torch.nn.functional as F
-# from flash_attn.ops.fused_dense import FusedDenseFunc
 from flash_attn.utils.distributed import (
-    # all_gather_raw,
     all_reduce_raw,
     reduce_scatter_raw,
 )
@@ -17,6 +15,7 @@
 
 import fused_dense_lib as fused_dense_cuda
 
+from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.utils.logger import get_logger
 
@@ -90,23 +89,53 @@ def gather_forward_split_backward(input_, parallel_mode, dim):
     return _GatherForwardSplitBackward.apply(input_, parallel_mode, dim)
 
 
-def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias):
-    assert my_input.dtype == grad_output.dtype
-    grad_weight = torch.matmul(grad_output.t(), my_input)
-    grad_bias = grad_output.sum(dim=0) if has_d_bias else None
-    return grad_weight, grad_bias
+class _SplitForwardGatherBackward(torch.autograd.Function):
+    """
+    Split the input and keep only the corresponding chunk to the rank.
+
+    Args:
+        input_: input matrix.
+        parallel_mode: parallel mode.
+ dim: dimension + """ + + @staticmethod + def symbolic(input_): + return _split(input_, parallel_mode=None) + + @staticmethod + def forward(ctx, input_, parallel_mode, dim): + ctx.mode = parallel_mode + ctx.dim = dim + return _split(input_, parallel_mode, dim) + + @staticmethod + def backward(ctx, grad_output): + return _gather(grad_output, ctx.mode, ctx.dim), None, None + + +def split_forward_gather_backward(input_, parallel_mode, dim): + return _SplitForwardGatherBackward.apply(input_, parallel_mode, dim) + def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0): world_size = torch.distributed.get_world_size(process_group) shape = list(input_.shape) shape[gather_dim] = shape[gather_dim] * world_size - # output = torch.empty(world_size * input_.shape[0], *input_.shape[1:], - # dtype=input_.dtype, device=input_.device) output = torch.empty(shape, dtype=input_.dtype, device=input_.device) handle = torch.distributed.all_gather_into_tensor(output, input_.contiguous(), group=process_group, async_op=async_op) return output, handle + +def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): + assert my_input.dtype == grad_output.dtype + grad_weight = torch.matmul(grad_output.t(), my_input) + grad_bias = grad_output.sum(dim=0) if has_d_bias else None + return grad_weight, grad_bias + + +# adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): @staticmethod @@ -253,6 +282,105 @@ def backward(ctx, grad_output, *args): return grad_input, grad_weight, grad_bias, None, None, None, None +class FSDPFusedDenseFunc(torch.autograd.Function): + + @staticmethod + @custom_fwd + def forward(ctx, x, weight, bias, return_residual=False, process_group=None): + + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + total_x = x.contiguous() + + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + # do all_gather for weight and bias before actual computation + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + if bias is not None: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() + else: + total_bias = bias + handle_weight.wait() + else: + total_weight = weight + total_bias = bias + + if torch.is_autocast_enabled(): + total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) + total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + + total_weight = total_weight.contiguous() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *total_weight.shape) > 65535 * 32: + raise RuntimeError('fused_dense only supports matrix dims <= 2M') + output = F.linear(total_x, total_weight, total_bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + grad_input, = args + grad_input = grad_input.contiguous() + process_group = 
ctx.process_group + if ctx.compute_weight_gradient: + x, weight = ctx.saved_tensors + total_x = x + else: + weight, = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + # do all-gather for weight before backward + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + else: + total_weight = weight + + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, total_weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, total_weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + else: + grad_input = None + + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + if world_size > 1: + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + handle_grad_bias.wait() + handle_grad_weight.wait() + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + return grad_input, grad_weight, grad_bias, None, None, None + + def fused_dense_func_torch( x: Tensor, weight: Tensor, @@ -271,33 +399,16 @@ def fused_dense_func_torch( return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) -class _SplitForwardGatherBackward(torch.autograd.Function): - """ - Split the input and keep only the corresponding chuck to the rank. - - Args: - input_: input matrix. - parallel_mode: parallel mode. 
- dim: dimension - """ - - @staticmethod - def symbolic(input_): - return _split(input_, parallel_mode=None) - - @staticmethod - def forward(ctx, input_, parallel_mode, dim): - ctx.mode = parallel_mode - ctx.dim = dim - return _split(input_, parallel_mode, dim) - - @staticmethod - def backward(ctx, grad_output): - return _gather(grad_output, ctx.mode, ctx.dim), None, None - - -def split_forward_gather_backward(input_, parallel_mode, dim): - return _SplitForwardGatherBackward.apply(input_, parallel_mode, dim) +def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, + return_residual: bool = False, process_group = None): + dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16] + or (x.dtype == torch.float32 and torch.is_autocast_enabled())) + if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: + return FSDPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) + else: + assert process_group is None + out = F.linear(x, weight, bias) + return out if not return_residual else (out, x) def try_import_RMSNorm(): From f191853bf40e7c367161b5cd7fa3e1d1c321605b Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:39:57 +0800 Subject: [PATCH 012/153] fix lint --- internlm/initialize/launch.py | 10 +++-- internlm/model/linear.py | 42 ++++++----------- internlm/model/modeling_internlm.py | 20 +++++---- internlm/model/multi_head_attention.py | 62 +++++++++++++------------- internlm/model/utils.py | 62 +++++++++++++------------- internlm/utils/evaluation.py | 10 +++-- 6 files changed, 99 insertions(+), 107 deletions(-) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 5bd2b73c..8c224bf8 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -279,12 +279,14 @@ def args_sanity_check(): assert not ( gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False ), "sequence parallel does not support use_flash_attn=False" - + if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "origin_tp" - - if gpc.config.parallel["tensor"].get("mode", None) == 'fstp': - assert gpc.config.parallel.sequence_parallel is True, "when the tp_mode is fstp, the sequence_parallel should be True." + + if gpc.config.parallel["tensor"].get("mode", None) == "fstp": + assert ( + gpc.config.parallel.sequence_parallel is True + ), "when the tp_mode is fstp, the sequence_parallel should be True." # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy if hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1: diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 4075e9ee..8e23871a 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -9,10 +9,9 @@ from flash_attn.utils.distributed import all_reduce, reduce_scatter from torch import nn - from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import Silu, fused_dense_func_torch, fsdp_fused_dense_func +from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch class ScaleColumnParallelLinear(nn.Linear): @@ -124,7 +123,12 @@ def forward(self, x, gather_dim=0): # If not, then the input is already gathered. 
return fused_dense_func_torch( - x, self.weight, self.bias, process_group=self.process_group, sequence_parallel=self.sequence_parallel, gather_dim=gather_dim, + x, + self.weight, + self.bias, + process_group=self.process_group, + sequence_parallel=self.sequence_parallel, + gather_dim=gather_dim, ) @@ -204,31 +208,13 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out -class FSDPLinear(ColumnParallelLinear): - - def forward(self, x): - return fsdp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group) - -class FSDPScaleLinear(ScaleColumnParallelLinear): - - def forward(self, input): # pylint: disable=W0622 - # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: - # we do an all_gather of x before doing the matmul. - # If not, then the input is already gathered. - if self.weight_scale != 1: - weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() - else: - weight = self.weight - return fsdp_fused_dense_func( - input, - weight, - self.bias, - process_group=self.process_group, - ) +class FSTPLinear(ColumnParallelLinear): + def forward(self, x): + return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group) -class FSDPFeedForward(nn.Module): +class FSTPFeedForward(nn.Module): """ FeedForward. @@ -259,7 +245,7 @@ def __init__( hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - self.w1 = FSDPLinear( + self.w1 = FSTPLinear( in_features, hidden_features, process_group, @@ -268,7 +254,7 @@ def __init__( device=device, dtype=dtype, ) - self.w2 = FSDPLinear( + self.w2 = FSTPLinear( in_features, hidden_features, process_group, @@ -277,7 +263,7 @@ def __init__( device=device, dtype=dtype, ) - self.w3 = FSDPLinear( + self.w3 = FSTPLinear( hidden_features, out_features, process_group, diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 56a8efac..b8d7e60d 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -15,13 +15,16 @@ from internlm.model.embedding import Embedding1D from internlm.model.linear import ( FeedForward, + FSTPFeedForward, RewardModelLinear, ScaleColumnParallelLinear, - FSDPScaleLinear, - FSDPFeedForward, ) from internlm.model.multi_head_attention import MHA -from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm, split_forward_gather_backward +from internlm.model.utils import ( + gather_forward_split_backward, + split_forward_gather_backward, + try_import_RMSNorm, +) from internlm.solver.pipeline_utils import partition_uniform from internlm.utils.checkpoint import activation_checkpoint from internlm.utils.common import filter_kwargs @@ -74,7 +77,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, - tp_mode: str = 'origin_tp', + tp_mode: str = "origin_tp", ): super().__init__() self.checkpoint = checkpoint @@ -111,7 +114,7 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - mlp_cls = FeedForward if tp_mode == 'origin_tp' else FSDPFeedForward + mlp_cls = FeedForward if tp_mode == "origin_tp" else FSTPFeedForward self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), @@ -173,7 +176,6 @@ def reset_parameters(self): else: normal_(std=0.006 if "fc1" in name else 0.0015)(param.data) - def forward(self, hidden_states, cu_seqlens=None, indexes=None, inference_params=None, max_seqlen=None): if self.checkpoint and 
self.training: return activation_checkpoint( @@ -341,7 +343,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, - tp_mode = self.tp_mode, + tp_mode=self.tp_mode, ) for lid in range(num_layers) ] @@ -388,9 +390,9 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] # if the tensor parallel mode is 'fstp', the indexes should also be split in sequence dimension. - if gpc.config.parallel.sequence_parallel and self.tp_mode == 'fstp': + if gpc.config.parallel.sequence_parallel and self.tp_mode == "fstp": indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) - + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None for _, block in enumerate(self.blocks): diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 8f7a064d..287a0e2d 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -2,9 +2,10 @@ # -*- encoding: utf-8 -*- import warnings -from typing import Optional +from typing import Any, Optional, Tuple import torch +import torch.distributed as dist from einops import rearrange from flash_attn.modules.mha import ( CrossAttention, @@ -13,26 +14,25 @@ SelfAttention, _update_kv_cache, ) -from torch import nn +from torch import Tensor, nn +from torch.nn import Module from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding -from internlm.model.linear import ColumnParallelLinearTorch, RowParallelLinearTorch, FSDPLinear - -import torch - -from typing import Any, Tuple -from torch import Tensor -from torch.nn import Module - -import torch.distributed as dist +from internlm.model.linear import ( + ColumnParallelLinearTorch, + FSTPLinear, + RowParallelLinearTorch, +) +# adpated from https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py class _SeqAllToAll(torch.autograd.Function): + "sequence alltoall" @staticmethod - def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor, scatter_idx: int, gather_idx: int) -> Tensor: + def forward(ctx: Any, group: dist.ProcessGroup, input_: Tensor, scatter_idx: int, gather_idx: int) -> Tensor: ctx.group = group ctx.scatter_idx = scatter_idx @@ -40,7 +40,7 @@ def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor, scatter_idx: int, seq_world_size = dist.get_world_size(group) - input_list = [t.contiguous() for t in torch.tensor_split(input, seq_world_size, scatter_idx)] + input_list = [t.contiguous() for t in torch.tensor_split(input_, seq_world_size, scatter_idx)] output_list = [torch.empty_like(input_list[0]) for _ in range(seq_world_size)] # TODO Use all_to_all_single instead dist.all_to_all(output_list, input_list, group=group) @@ -51,6 +51,7 @@ def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]: return (None, _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx), None, None) +# adpated from https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py class DistributedAttention(torch.nn.Module): """Initialization. 
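
The data movement behind `_SeqAllToAll` is easiest to see in a standalone
sketch. The snippet below is illustrative only: it assumes an already
initialized process group and example shapes, and the function name is ours,
not part of the patch; the patch's real implementation is the
autograd.Function shown above.

    import torch
    import torch.distributed as dist

    def seq_all_to_all(input_, group, scatter_idx, gather_idx):
        # Split the local tensor along scatter_idx, exchange one chunk with
        # every rank, then concatenate the received chunks along gather_idx.
        world_size = dist.get_world_size(group)
        inputs = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_idx)]
        outputs = [torch.empty_like(inputs[0]) for _ in range(world_size)]
        dist.all_to_all(outputs, inputs, group=group)
        return torch.cat(outputs, dim=gather_idx)

With a 4-way sequence-parallel group and, e.g., 32 attention heads, a packed
qkv of shape [seq/4, 3, 32, head_dim] scattered on the head dimension and
gathered on the sequence dimension becomes [seq, 3, 8, head_dim]: the wrapped
local attention sees the full sequence but only a subset of heads, and the
second all-to-all after attention restores the original layout.
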
@@ -73,16 +74,16 @@ def __init__( second_gather_idx: int = 1, ) -> None: - super(DistributedAttention, self).__init__() + super().__init__() self.local_attn = local_attention self.spg = sequence_process_group self.first_scatter_idx = first_scatter_idx self.first_gather_idx = first_gather_idx self.second_scatter_idx = second_scatter_idx self.second_gather_idx = second_gather_idx - + def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: - """ forward + """forward Arguments: query (Tensor): query input to the layer @@ -93,24 +94,25 @@ def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: Returns: * output (Tensor): context output """ - # TODO Merge three alltoall calls into one + # Evaluation if qkv.ndim == 5: - # in shape: [seq/tp_size, 3, head, head_dim] + # in shape: [batch, seq/tp_size, 3, head, head_dim] qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx + 1, self.first_gather_idx + 1) - #out shape : [seq, head/tp_size, head_dim] + # out shape : [batch, seq, head/tp_size, head_dim] context_layer = self.local_attn(qkv, **kwargs) - # in shape: [seq, head/tp_size, head_dim] - output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx + 1, self.second_gather_idx + 1) - else: - + # in shape: [batch, seq, head/tp_size, head_dim] + output = _SeqAllToAll.apply( + self.spg, context_layer, self.second_scatter_idx + 1, self.second_gather_idx + 1 + ) + else: # training # in shape: [seq/tp_size, 3, head, head_dim] qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx, self.first_gather_idx) - #out shape : [seq, head/tp_size, head_dim] + # out shape : [seq, head/tp_size, head_dim] context_layer = self.local_attn(qkv, **kwargs) # in shape: [seq, head/tp_size, head_dim] output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx, self.second_gather_idx) - #out e.g., [s/p::h] + # out e.g., [s/p::h] return output @@ -157,7 +159,7 @@ def __init__( use_flash_attn: bool = True, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, - tp_mode: str = 'origin_tp', + tp_mode: str = "origin_tp", ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -185,7 +187,7 @@ def __init__( self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device) # notice here should change bias=True - Wqkv_cls = ColumnParallelLinearTorch if tp_mode == 'origin_tp' else FSDPLinear + Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear self.Wqkv = Wqkv_cls( embed_dim, 3 * embed_dim, @@ -201,12 +203,12 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) - if tp_mode == 'fstp': + if tp_mode == "fstp": self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group) self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group) # output projection always have the bias (for now) - out_proj_cls = RowParallelLinearTorch if tp_mode == 'origin_tp' else FSDPLinear + out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear self.out_proj = out_proj_cls( embed_dim, embed_dim, @@ -214,7 +216,6 @@ def __init__( sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) - # need to assign tp attribute so that internlm know it is tensor parallel module if gpc.get_world_size(ParallelMode.TENSOR) > 1: for name in ["out_proj", "Wqkv"]: @@ -311,7 +312,6 @@ def _packed_forward(self, x, 
inference_params=None, **kwargs): qkv = rearrange(qkv, "t (three h d) -> t three h d", three=3, d=self.head_dim) # total x 3 x n_head x d qkv = self.rotary_emb(qkv, **kwargs) kwargs.pop("indexes") - if inference_params is None: if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn: with torch.cuda.amp.autocast(dtype=torch.bfloat16): diff --git a/internlm/model/utils.py b/internlm/model/utils.py index c8845440..67e89ad1 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -3,18 +3,14 @@ from typing import Optional +import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F -from flash_attn.utils.distributed import ( - all_reduce_raw, - reduce_scatter_raw, -) +from flash_attn.utils.distributed import all_reduce_raw, reduce_scatter_raw from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup -import fused_dense_lib as fused_dense_cuda - from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger @@ -123,8 +119,9 @@ def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = shape = list(input_.shape) shape[gather_dim] = shape[gather_dim] * world_size output = torch.empty(shape, dtype=input_.dtype, device=input_.device) - handle = torch.distributed.all_gather_into_tensor(output, input_.contiguous(), - group=process_group, async_op=async_op) + handle = torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) return output, handle @@ -137,11 +134,11 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): + "tp fused dense function" @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, - sequence_parallel=True, gather_dim=0): + def forward(ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True, gather_dim=0): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel with sequence parallelism: we do an all_gather_raw of x before doing the matmul. 
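
The contract described in this docstring can be written out as a plain,
unfused reference. This is a sketch for exposition only, reusing the
`all_gather_raw` defined earlier in this file and assuming a column-sharded
weight with illustrative shapes; the autograd.Function itself additionally
overlaps the gather with other work and implements the matching backward.

    import torch.nn.functional as F

    def sp_linear_reference(x_shard, weight_shard, bias_shard, process_group):
        # x_shard: [seq_len/P, ..., in_features] on each of the P ranks.
        total_x, handle = all_gather_raw(x_shard, process_group, async_op=True, gather_dim=0)
        handle.wait()
        # weight_shard: [out_features/P, in_features], so the output stays
        # sharded along the feature dim: [seq_len, ..., out_features/P].
        return F.linear(total_x, weight_shard, bias_shard)

In the backward pass the roles flip: the gradient w.r.t. the gathered input
is reduce-scattered back into per-rank sequence shards, which is why the
backward below picks `reduce_scatter_raw` when `sequence_parallel` is set and
`all_reduce_raw` otherwise.
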
@@ -171,7 +168,7 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, batch_dim = batch_shape.numel() # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 if min(batch_dim, n, *weight.shape) > 65535 * 32: - raise RuntimeError('fused_dense only supports matrix dims <= 2M') + raise RuntimeError("fused_dense only supports matrix dims <= 2M") output = F.linear(total_x, weight, bias) if ctx.compute_weight_gradient: ctx.save_for_backward(x, weight) @@ -184,12 +181,12 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, def backward(ctx, grad_output, *args): grad_output = grad_output.contiguous() if ctx.return_residual: - grad_input, = args + (grad_input,) = args grad_input = grad_input.contiguous() process_group = ctx.process_group sequence_parallel = ctx.sequence_parallel gather_dim = ctx.gather_dim - + if ctx.compute_weight_gradient: x, weight = ctx.saved_tensors if process_group is not None and sequence_parallel: @@ -197,7 +194,7 @@ def backward(ctx, grad_output, *args): else: total_x = x else: - weight, = ctx.saved_tensors + (weight,) = ctx.saved_tensors total_x = None batch_shape = grad_output.shape[:-1] batch_dim = batch_shape.numel() @@ -206,8 +203,7 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, weight) + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -282,7 +278,8 @@ def backward(ctx, grad_output, *args): return grad_input, grad_weight, grad_bias, None, None, None, None -class FSDPFusedDenseFunc(torch.autograd.Function): +class FSTPFusedDenseFunc(torch.autograd.Function): + "FSTP fused dense function" @staticmethod @custom_fwd @@ -295,7 +292,7 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) total_x = x.contiguous() - + world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all_gather for weight and bias before actual computation @@ -313,13 +310,13 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): if torch.is_autocast_enabled(): total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None - + total_weight = total_weight.contiguous() batch_shape, n = total_x.shape[:-1], total_x.shape[-1] batch_dim = batch_shape.numel() # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 if min(batch_dim, n, *total_weight.shape) > 65535 * 32: - raise RuntimeError('fused_dense only supports matrix dims <= 2M') + raise RuntimeError("fused_dense only supports matrix dims <= 2M") output = F.linear(total_x, total_weight, total_bias) if ctx.compute_weight_gradient: ctx.save_for_backward(x, weight) @@ -332,19 +329,19 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): def backward(ctx, grad_output, *args): grad_output = grad_output.contiguous() if ctx.return_residual: - grad_input, = args + (grad_input,) = args grad_input = 
grad_input.contiguous() process_group = ctx.process_group if ctx.compute_weight_gradient: x, weight = ctx.saved_tensors total_x = x else: - weight, = ctx.saved_tensors + (weight,) = ctx.saved_tensors total_x = None batch_shape = grad_output.shape[:-1] batch_dim = batch_shape.numel() grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - + world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all-gather for weight before backward @@ -352,13 +349,12 @@ def backward(ctx, grad_output, *args): handle_weight.wait() else: total_weight = weight - + if ctx.needs_input_grad[0]: if not ctx.return_residual: grad_input = F.linear(grad_output, total_weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, total_weight) + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) else: grad_input = None @@ -372,7 +368,7 @@ def backward(ctx, grad_output, *args): if world_size > 1: grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) handle_grad_bias.wait() handle_grad_weight.wait() else: @@ -399,12 +395,14 @@ def fused_dense_func_torch( return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) -def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, - return_residual: bool = False, process_group = None): - dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16] - or (x.dtype == torch.float32 and torch.is_autocast_enabled())) +def fstp_fused_dense_func( + x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, return_residual: bool = False, process_group=None +): + dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( + x.dtype == torch.float32 and torch.is_autocast_enabled() + ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSDPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) + return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) else: assert process_group is None out = F.linear(x, weight, bias) diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 148d19df..968a1db1 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -54,7 +54,7 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - if gpc.config.parallel["tensor"]["mode"] == 'fstp': + if gpc.config.parallel["tensor"]["mode"] == "fstp": gpc.config.parallel.sequence_parallel = True else: gpc.config.parallel.sequence_parallel = False @@ -106,10 +106,14 @@ def evaluate_on_val_dls( total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz - if gpc.config.parallel['tensor']['mode'] == 'fstp': + if gpc.config.parallel["tensor"]["mode"] == "fstp": sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR) tensor_shape = torch.Size( - [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1] // sequence_world_size, gpc.config.HIDDEN_SIZE] + [ + data_cfg.micro_bsz, + 
batch[0]["input_ids"].shape[1] // sequence_world_size, + gpc.config.HIDDEN_SIZE, + ] ) else: tensor_shape = torch.Size( From a8dea6313fe85e6e762177c34b7786657fca89b1 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 21:33:26 +0800 Subject: [PATCH 013/153] fix the ci incompatible in config --- internlm/initialize/launch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 6a094e77..e5bd8610 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -305,9 +305,12 @@ def args_sanity_check(): gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False ), "sequence parallel does not support use_flash_attn=False" + if isinstance (gpc.config.parallel["tensor"], int): + gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode='origin_tp') + if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "origin_tp" - + if gpc.config.parallel["tensor"].get("mode", None) == "fstp": assert ( gpc.config.parallel.sequence_parallel is True From 1b7935dd98d7879ae7effd1723ffa70e32869c5e Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 21:35:52 +0800 Subject: [PATCH 014/153] merge upstream develop --- internlm/model/multi_head_attention.py | 1 + 1 file changed, 1 insertion(+) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 287a0e2d..49578d77 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -6,6 +6,7 @@ import torch import torch.distributed as dist +import torch.nn.functional as F from einops import rearrange from flash_attn.modules.mha import ( CrossAttention, From db637542a614468365c2a9a2e2f6a720c158f11f Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 22:19:21 +0800 Subject: [PATCH 015/153] fix lint --- internlm/initialize/launch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index e5bd8610..80611fee 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -305,12 +305,12 @@ def args_sanity_check(): gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False ), "sequence parallel does not support use_flash_attn=False" - if isinstance (gpc.config.parallel["tensor"], int): - gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode='origin_tp') + if isinstance(gpc.config.parallel["tensor"], int): + gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode="origin_tp") if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "origin_tp" - + if gpc.config.parallel["tensor"].get("mode", None) == "fstp": assert ( gpc.config.parallel.sequence_parallel is True From 5fb6d99c112dbbc61bd95977dd13bd112a3e03f0 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 10 Oct 2023 11:45:11 +0800 Subject: [PATCH 016/153] feat(configs/7B_sft.py): update parallel config comment --- configs/7B_sft.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index d8557007..dee2f5eb 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -142,20 +142,27 @@ num_chunks=1, # if num_chunks > 1, interleaved 
pipeline scheduler is used. ) """ -zero1 parallel: - 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters. - 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size. +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. pipeline parallel (dict): 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler. -tensor parallel: tensor parallel size, usually the number of GPUs per node. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. """ parallel = dict( zero1=dict(size=8, fsdp=False), - tensor=dict(size=1, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True + tensor=dict(size=1, mode="origin_tp"), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=False, ) From 0fac845c3664bef850b8762526a55a1da9467206 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 10 Oct 2023 17:06:13 +0800 Subject: [PATCH 017/153] overlap grad_input computation and grad_weight reduce_scatter --- configs/7B_sft.py | 8 ++++---- internlm/model/modeling_internlm.py | 5 +++-- internlm/model/utils.py | 29 +++++++++++++++++------------ 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index d8557007..ac491215 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -1,7 +1,7 @@ JOB_NAME = "7b_train" DO_ALERT = False -SEQ_LEN = 2048 +SEQ_LEN = 4096 HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 @@ -154,10 +154,10 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node. """ parallel = dict( - zero1=dict(size=8, fsdp=False), - tensor=dict(size=1, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True + zero1=dict(size=1, fsdp=False), + tensor=dict(size=8, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=False, + sequence_parallel=True, ) cudnn_deterministic = False diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index b8d7e60d..228dbd34 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -407,10 +407,11 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): + # Evaluation if hidden_states.ndim == 3: hidden_states = self.head(hidden_states, gather_dim=1) - else: - hidden_states = self.head(hidden_states) + else: # Training + hidden_states = self.head(hidden_states, gather_dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 67e89ad1..3885488b 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -349,16 +349,8 @@ def backward(ctx, grad_output, *args): handle_weight.wait() else: total_weight = weight - - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, total_weight.t()) - else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - else: - grad_input = None - + + # compute weight grad if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient @@ -369,11 +361,24 @@ def backward(ctx, grad_output, *args): grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - handle_grad_bias.wait() - handle_grad_weight.wait() else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None + + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, total_weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + else: + grad_input = None + + if ctx.needs_input_grad[1]: + if world_size > 1: + handle_grad_weight.wait() + if grad_bias is not None: + handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None From 792b066f151c438a6ba653a8aafe9207a459907a Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 11 Oct 2023 10:57:12 +0800 Subject: [PATCH 018/153] communication overlap --- configs/7B_sft.py | 2 +- internlm/model/linear.py | 74 +++++++++++++++++++++++++++++++++++++++- internlm/model/utils.py | 6 ++-- 3 files changed, 78 insertions(+), 4 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 814966b1..e8be1677 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=1, fsdp=False), - tensor=dict(size=8, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True + tensor=dict(size=2, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True
     pipeline=dict(size=1, interleaved_overlap=True),
     sequence_parallel=True,
 )
 
 cudnn_deterministic = False
diff --git a/internlm/model/linear.py b/internlm/model/linear.py
index 8e23871a..36f64f33 100644
--- a/internlm/model/linear.py
+++ b/internlm/model/linear.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
-from typing import Optional
+from typing import Optional, Union, Any
 
 import torch
 import torch.nn.functional as F
@@ -11,7 +11,8 @@
 
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch
+from internlm.core.naive_amp import NaiveAMPModel
+from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch, all_gather_raw
 
 
 class ScaleColumnParallelLinear(nn.Linear):
@@ -211,6 +212,7 @@ def forward(self, x):
 
 class FSTPLinear(ColumnParallelLinear):
     def forward(self, x):
+        import pdb; pdb.set_trace()
         return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group)
 
 
@@ -278,3 +280,74 @@ def forward(self, x):
         w2_o = self.w2(x)
         out = self.w3(F.silu(w1_o) * w2_o)
         return out
+
+class FSTPAllGatherSyncHandler:
+    """
+    All-gather handler for overlapping the all-gather in adjacent FSTP linear.
+    """
+
+    def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None:
+
+        self.process_group = process_group
+        self.FSTP_modules = []
+        self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"]
+        self.FSTP_global_weights = dict()  # key: FSTP module; value: module global weight for forward
+        self.module_handler = dict()  # key: FSTP module; value: all-gather handler
+        self.module_block = dict()  # key: FSTP module; value: transformer block index
+        self.block_module = dict()  # key: transformer block index; value: {name_index: FSTP module}
+        self.module_name_index = dict()  # key: FSTP module; value: the index of the name in self.module_name
+
+        # just want to share same for loop for ModuleList and Module
+        if not isinstance(model, nn.ModuleList):
+            model = [model]
+
+        for _chunk in model:
+            if isinstance(_chunk, NaiveAMPModel):
+                _chunk = _chunk.model
+
+            for _, children in _chunk.named_children():
+                if isinstance(children, nn.ModuleList):
+                    for _, block in enumerate(children):
+                        index = 0
+                        sub_modules = list(block.children())
+                        if len(sub_modules) > 0:
+                            for name, child in block.named_children():
+                                if isinstance(child, FSTPLinear):
+                                    self.FSTP_modules.append(child)
+                                    self.module_block[child] = _
+                                    self.block_module[_][index] = child
+                                    self.module_name_index[child] = index
+                                    index = index + 1
+                        else:
+                            continue
+
+
+    def _register_sync_parameters_hook(self) -> None:
+        """
+        register pre_forward_hook and pre_backward_hook for FSTPLinear.
+ """ + + def _hook(module: nn.Module): + block_index = self.module_block[module] + name_index = self.module_name_index[module] + if name_index == 0: + next_module = self.block_module[block_index][name_index + 1] + self.FSTP_global_weights, weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.module_handler[next_module] = weights_handler + else: + handler = self.module_handler[module] + handler.wait() + if name_index != 4: + next_module = self.block_module[block_index][name_index + 1] + self.FSTP_global_weights, weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.module_handler[next_module] = weights_handler + + def _pre_forward_hook(module: nn.Module, inputs: Any): + _hook(module) + + def _pre_backward_hook(module: nn.Module, grad_input, grad_output): + _hook(module) + + for module in self.FSTP_modules: + module.register_forward_pre_hook(_pre_forward_hook) + module.register_backward_pre_hook(_pre_backward_hook) \ No newline at end of file diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 3885488b..5768f000 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional +from typing import Any, Optional, Union import fused_dense_lib as fused_dense_cuda import torch @@ -379,7 +379,7 @@ def backward(ctx, grad_output, *args): handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() - return grad_input, grad_weight, grad_bias, None, None, None + return grad_input, grad_weight, grad_bias, None, None def fused_dense_func_torch( @@ -453,3 +453,5 @@ def Silu(w1_o, w2_o): Silu = torch.jit.script(Silu) + + From 5fd5a8a32b0e045a499e8beb3f0438cb0bd49408 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 11 Oct 2023 17:36:41 +0800 Subject: [PATCH 019/153] support fine-grained overlap --- configs/7B_sft.py | 2 +- internlm/model/linear.py | 78 ++++++++++++++++++-------- internlm/model/multi_head_attention.py | 3 +- internlm/model/utils.py | 35 +++++++----- internlm/train/training_internlm.py | 8 ++- 5 files changed, 86 insertions(+), 40 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index e8be1677..814966b1 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=1, fsdp=False), - tensor=dict(size=2, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True + tensor=dict(size=8, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 36f64f33..42bd9f03 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -11,7 +11,8 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch +from internlm.core.naive_amp import NaiveAMPModel +from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch, all_gather_raw class ScaleColumnParallelLinear(nn.Linear): @@ -211,8 +212,7 @@ def forward(self, x): class FSTPLinear(ColumnParallelLinear): def forward(self, x): - import pdb; pdb.set_trace() - return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group) + return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group, module=self, handler=gpc.config.fstp_handler) class FSTPFeedForward(nn.Module): @@ -287,6 +287,7 @@ class FSTPAllGatherSyncHandler: def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: + # import pdb; pdb.set_trace() self.process_group = process_group self.FSTP_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] @@ -306,19 +307,21 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non for _, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): - for _, block in enumerate(children): + for idx, block in enumerate(children): index = 0 - sub_modules = list(block.children()) - if len(sub_modules) > 0: - for name, child in block.named_children(): - if isinstance(child, FSTPLinear): - self.FSTP_modules.append(child) - self.module_block[child] = _ - self.block_module[_][index] = child - self.module_name_index[child] = index - index = index + 1 - else: - continue + self.block_module[idx] = {} + for _, sub in block.named_children(): + sub_modules = list(sub.children()) + if len(sub_modules) > 0: + for name, child in sub.named_children(): + if isinstance(child, FSTPLinear): + self.FSTP_modules.append(child) + self.module_block[child] = idx + self.block_module[idx][index] = child + self.module_name_index[child] = index + index = index + 1 + else: + continue def _register_sync_parameters_hook(self) -> None: @@ -326,27 +329,58 @@ def _register_sync_parameters_hook(self) -> None: register pre_forward_hook and pre_backward_hook for FSTPLinear. 
""" - def _hook(module: nn.Module): + def _pre_forward_hook(module: nn.Module, inputs: Any): block_index = self.module_block[module] name_index = self.module_name_index[module] if name_index == 0: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights, weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) self.module_handler[next_module] = weights_handler else: handler = self.module_handler[module] handler.wait() if name_index != 4: next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights, weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) self.module_handler[next_module] = weights_handler - def _pre_forward_hook(module: nn.Module, inputs: Any): - _hook(module) + def _post_forward_hook(module: nn.Module, input, output): + del self.FSTP_global_weights[module] + del self.module_handler[module] def _pre_backward_hook(module: nn.Module, grad_input, grad_output): - _hook(module) + block_index = self.module_block[module] + name_index = self.module_name_index[module] + if name_index == 4: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.module_handler[next_module] = weights_handler + else: + handler = self.module_handler[module] + handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.module_handler[next_module] = weights_handler + + def _post_backward_hook(module, grad_input, grad_output): + del self.FSTP_global_weights[module] for module in self.FSTP_modules: + # import pdb; pdb.set_trace() module.register_forward_pre_hook(_pre_forward_hook) - module.register_backward_pre_hook(_pre_backward_hook) \ No newline at end of file + module.register_forward_hook(_post_forward_hook) + # module.register_backward_pre_hook(_pre_backward_hook) + # module.register_backward_hook(_post_backward_hook) + module.register_module_full_backward_pre_hook(_pre_backward_hook) + \ No newline at end of file diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 436caf77..1db98d7e 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -210,7 +210,7 @@ def __init__( embed_dim, 3 * embed_dim, process_group, - bias=True, + bias=False, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) # according to https://spaces.ac.cn/archives/9577 @@ -231,6 +231,7 @@ def __init__( embed_dim, embed_dim, process_group, + bias=False, sequence_parallel=gpc.config.parallel.sequence_parallel, 
**factory_kwargs, ) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 5768f000..50b9bbd7 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -283,11 +283,13 @@ class FSTPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None): + def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group + ctx.all_gather_handler = all_gather_handler + ctx.module = module if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -295,14 +297,16 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: - # do all_gather for weight and bias before actual computation - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - if bias is not None: - total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) - handle_bias.wait() - else: - total_bias = bias - handle_weight.wait() + total_weight = all_gather_handler.FSTP_global_weights[module] + total_bias = bias + # # do all_gather for weight and bias before actual computation + # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + # if bias is not None: + # total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + # handle_bias.wait() + # else: + # total_bias = bias + # handle_weight.wait() else: total_weight = weight total_bias = bias @@ -332,6 +336,8 @@ def backward(ctx, grad_output, *args): (grad_input,) = args grad_input = grad_input.contiguous() process_group = ctx.process_group + all_gather_handler = ctx.all_gather_handler + module = ctx.module if ctx.compute_weight_gradient: x, weight = ctx.saved_tensors total_x = x @@ -345,8 +351,9 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all-gather for weight before backward - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() + # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + # handle_weight.wait() + total_weight = all_gather_handler.FSTP_global_weights[module] else: total_weight = weight @@ -379,7 +386,7 @@ def backward(ctx, grad_output, *args): handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() - return grad_input, grad_weight, grad_bias, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None def fused_dense_func_torch( @@ -401,13 +408,13 @@ def fused_dense_func_torch( def fstp_fused_dense_func( - x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, return_residual: bool = False, process_group=None + x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, return_residual: bool = False, process_group=None, module=None, handler=None ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) + return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler) else: assert process_group is None out = F.linear(x, weight, 
bias) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 7af58ddf..5deb0233 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -39,6 +39,7 @@ FeedForward, RewardModelLinear, ScaleColumnParallelLinear, + FSTPAllGatherSyncHandler, ) from internlm.model.multi_head_attention import MHA from internlm.model.utils import try_import_RMSNorm @@ -106,10 +107,13 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - + + if gpc.config.parallel["tensor"]["mode"] == "fstp": + handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + handler._register_sync_parameters_hook() + gpc.config.fstp_handler = handler return model - def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): if gpc.config.parallel.zero1.fsdp: # set wrap_policy for fsdp wrap From d0b1346993a493f3c7b5d1b109eba41731711002 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 12 Oct 2023 19:42:08 +0800 Subject: [PATCH 020/153] feat(model/linear.py): support block allgather overlap --- internlm/model/linear.py | 207 ++++++++++++++++++++++++---- internlm/model/utils.py | 45 +++--- internlm/train/training_internlm.py | 8 +- 3 files changed, 212 insertions(+), 48 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 42bd9f03..3e37863d 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional, Union, Any +from typing import Any, Optional, Union import torch import torch.nn.functional as F @@ -12,7 +12,12 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel -from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch, all_gather_raw +from internlm.model.utils import ( + Silu, + all_gather_raw, + fstp_fused_dense_func, + fused_dense_func_torch, +) class ScaleColumnParallelLinear(nn.Linear): @@ -212,7 +217,9 @@ def forward(self, x): class FSTPLinear(ColumnParallelLinear): def forward(self, x): - return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group, module=self, handler=gpc.config.fstp_handler) + return fstp_fused_dense_func( + x, self.weight, self.bias, process_group=self.process_group, module=self, handler=gpc.config.fstp_handler + ) class FSTPFeedForward(nn.Module): @@ -280,31 +287,31 @@ def forward(self, x): out = self.w3(F.silu(w1_o) * w2_o) return out + class FSTPAllGatherSyncHandler: """ All-gather handler for overlapping the all-gather in adjcent FSTP linear. 
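
Conceptually, every FSTP linear computes against a transient full weight: gather the dim-0 shards, run one matmul, drop the gathered copy. A simplified picture of a single step with the prefetch machinery stripped away (a sketch, not the actual forward):

import torch.nn.functional as F
from internlm.model.utils import all_gather_raw  # as imported by this patch

def fstp_linear_step(x, sharded_weight, bias, process_group):
    total_weight, handle = all_gather_raw(sharded_weight, process_group, async_op=True)
    handle.wait()  # with hook-driven prefetch, this wait overlaps earlier compute
    out = F.linear(x, total_weight, bias)
    del total_weight  # the full weight lives only for the duration of this matmul
    return out
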
""" - + def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - # import pdb; pdb.set_trace() self.process_group = process_group self.FSTP_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward - self.module_handler = dict() # key: FSTP module; value: all-gather handler - self.module_block = dict() # key: FSTP module; value: transformer block index - self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name - + self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward + self.module_handler = dict() # key: FSTP module; value: all-gather handler + self.module_block = dict() # key: FSTP module; value: transformer block index + self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} + self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name + # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): model = [model] - + for _chunk in model: if isinstance(_chunk, NaiveAMPModel): _chunk = _chunk.model - + for _, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): for idx, block in enumerate(children): @@ -322,13 +329,12 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non index = index + 1 else: continue - - + def _register_sync_parameters_hook(self) -> None: """ register pre_forward_hook and pre_backward_hook for FSTPLinear. """ - + def _pre_forward_hook(module: nn.Module, inputs: Any): block_index = self.module_block[module] name_index = self.module_name_index[module] @@ -336,19 +342,23 @@ def _pre_forward_hook(module: nn.Module, inputs: Any): total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) weight_handler.wait() self.FSTP_global_weights[module] = total_weight - + # start the all-gather for next module next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) self.module_handler[next_module] = weights_handler else: handler = self.module_handler[module] handler.wait() if name_index != 4: next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) self.module_handler[next_module] = weights_handler - + def _post_forward_hook(module: nn.Module, input, output): del self.FSTP_global_weights[module] del self.module_handler[module] @@ -360,22 +370,26 @@ def _pre_backward_hook(module: nn.Module, grad_input, grad_output): total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) weight_handler.wait() self.FSTP_global_weights[module] = total_weight - + # start the all-gather for next module next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = 
all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) self.module_handler[next_module] = weights_handler else: handler = self.module_handler[module] handler.wait() if name_index != 0: next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) self.module_handler[next_module] = weights_handler - + def _post_backward_hook(module, grad_input, grad_output): del self.FSTP_global_weights[module] - + for module in self.FSTP_modules: # import pdb; pdb.set_trace() module.register_forward_pre_hook(_pre_forward_hook) @@ -383,4 +397,145 @@ def _post_backward_hook(module, grad_input, grad_output): # module.register_backward_pre_hook(_pre_backward_hook) # module.register_backward_hook(_post_backward_hook) module.register_module_full_backward_pre_hook(_pre_backward_hook) - \ No newline at end of file + + +class CoarseGrainedFSTPAllGatherSyncHandler: + """ + All-gather handler for overlapping the all-gather in adjcent FSTP block. + """ + + def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: + # import pdb; pdb.set_trace() + self.process_group = process_group + self.FSTP_blocks = [] + self.FSTP_outs = [] + self.FSTP_wqkvs = [] + self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] + self.FSTP_global_handle = dict() # key: FSTP module; value: module global all-gather op handle + self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward + self.block_handles = dict() # key: transformer block; value: all-gather handles + self.module_to_index = dict() # key: FSTP module; value: transformer block index + self.block_to_index = dict() # key: transformer block; value: transformer block index + self.index_to_block = dict() # key: transformer block index; value: transformer block + self.index_to_fsdp_modules = dict() # key: transformer block index; value: fsdp modules + + # just want to share same for loop for ModuleList and Module + if not isinstance(model, nn.ModuleList): + model = [model] + + for _chunk in model: + if isinstance(_chunk, NaiveAMPModel): + _chunk = _chunk.model + + for _, children in _chunk.named_children(): + if isinstance(children, nn.ModuleList): + for idx, block in enumerate(children): + self.FSTP_blocks.append(block) + self.block_to_index[block] = idx + self.index_to_block[idx] = block + self.index_to_fsdp_modules[idx] = [] + for _, sub in block.named_children(): + sub_modules = list(sub.children()) + if len(sub_modules) > 0: + for name, child in sub.named_children(): + # print(f"name: {name}", flush=True) + if name == "out_proj": + self.FSTP_outs.append(child) + self.module_to_index[child] = idx + if name == "Wqkv": + self.FSTP_wqkvs.append(child) + self.module_to_index[child] = idx + if isinstance(child, FSTPLinear): + self.index_to_fsdp_modules[idx].append(child) + else: + continue + + def _all_gather_block_weight(self, block_index: int): + block = self.index_to_block[block_index] + fsdp_modules = self.index_to_fsdp_modules[block_index] + self.block_handles[block] = [] + for module in fsdp_modules: + total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) + 
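
The coarse-grained handler prefetches a whole block at a time. The loop being assembled here amounts to the sketch below (prefetch_block is an illustrative wrapper around the same calls):

def prefetch_block(self, block_index: int):
    # Kick off async all-gathers for every FSTP linear of one block and
    # remember the handles so the block's pre-forward hook can wait on them.
    block = self.index_to_block[block_index]
    self.block_handles[block] = []
    for module in self.index_to_fsdp_modules[block_index]:
        total_weight, handle = all_gather_raw(module.weight, self.process_group, async_op=True)
        self.FSTP_global_weights[module] = total_weight
        self.block_handles[block].append(handle)

Forward prefetches block i+1 from block i's out_proj (the last linear to run); backward mirrors the chain from the other end.
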
self.FSTP_global_weights[module] = total_weight + self.block_handles[block].append(weight_handle) + + def _register_sync_parameters_hook(self) -> None: + """ + register pre_forward_hook and pre_backward_hook for FSTP block. + + Notice that next block's all_gather op should be after current block's all_to_all op, so we + 1. register pre_forward_hook @out_proj module to prefetch for next block + 2. register pre_forward_hook @block module to wait handles for next block + 3. register pre_backward_hook @wqkv module to prefetch for next block + 4. register pre_backward_hook @block module to wait handles for next block + """ + + def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): + block_index = self.module_to_index[module] + # start the all-gather for next block + if block_index + 1 < gpc.config.NUM_LAYER: + self._all_gather_block_weight(block_index + 1) + + def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): + block_index = self.block_to_index[block] + if block_index == 0: + # all gather weight for block 0 + fsdp_modules = self.index_to_fsdp_modules[block_index] + for module in fsdp_modules: + total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handle.wait() + self.FSTP_global_weights[module] = total_weight + else: + # wait handle for current block + handles = self.block_handles[block] + for handle in handles: + handle.wait() + + def _post_forward_hook_for_block(block: nn.Module, input, output): + block_index = self.block_to_index[block] + fsdp_modules = self.index_to_fsdp_modules[block_index] + if block in self.block_handles: + del self.block_handles[block] + for module in fsdp_modules: + del self.FSTP_global_weights[module] + + def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): + block_index = self.module_to_index[module] + # start the all-gather for next block + if block_index - 1 >= 0: + self._all_gather_block_weight(block_index - 1) + + def _pre_backward_hook_for_block(block: nn.Module, grad_output): + block_index = self.block_to_index[block] + if block_index == gpc.config.NUM_LAYER - 1: + # all gather weight for the last block + fsdp_modules = self.index_to_fsdp_modules[block_index] + for module in fsdp_modules: + total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handle.wait() + self.FSTP_global_weights[module] = total_weight + else: + # wait handle for current block + handles = self.block_handles[block] + for handle in handles: + handle.wait() + + def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): + block_index = self.block_to_index[block] + fsdp_modules = self.index_to_fsdp_modules[block_index] + if block in self.block_handles: + del self.block_handles[block] + for module in fsdp_modules: + del self.FSTP_global_weights[module] + + for block in self.FSTP_blocks: + block.register_forward_pre_hook(_pre_forward_hook_for_block) + block.register_forward_hook(_post_forward_hook_for_block) + block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + block.register_full_backward_hook(_post_backward_hook_for_block) + + for out_proj in self.FSTP_outs: + out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) + + for wqkv in self.FSTP_wqkvs: + wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 50b9bbd7..97319d98 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -284,7 +284,6 @@ class 
FSTPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None): - ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group @@ -297,16 +296,18 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: - total_weight = all_gather_handler.FSTP_global_weights[module] - total_bias = bias - # # do all_gather for weight and bias before actual computation - # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - # if bias is not None: - # total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) - # handle_bias.wait() - # else: - # total_bias = bias - # handle_weight.wait() + # do all_gather for weight and bias before actual computation + if module in all_gather_handler.FSTP_global_weights: + total_weight = all_gather_handler.FSTP_global_weights[module] + else: + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + + if bias is not None: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() + else: + total_bias = bias else: total_weight = weight total_bias = bias @@ -351,12 +352,14 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all-gather for weight before backward - # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - # handle_weight.wait() - total_weight = all_gather_handler.FSTP_global_weights[module] + if module in all_gather_handler.FSTP_global_weights: + total_weight = all_gather_handler.FSTP_global_weights[module] + else: + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() else: total_weight = weight - + # compute weight grad if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient @@ -380,7 +383,7 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) else: grad_input = None - + if ctx.needs_input_grad[1]: if world_size > 1: handle_grad_weight.wait() @@ -408,7 +411,13 @@ def fused_dense_func_torch( def fstp_fused_dense_func( - x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, return_residual: bool = False, process_group=None, module=None, handler=None + x: Tensor, + weight: Tensor, + bias: Optional[Tensor] = None, + return_residual: bool = False, + process_group=None, + module=None, + handler=None, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() @@ -460,5 +469,3 @@ def Silu(w1_o, w2_o): Silu = torch.jit.script(Silu) - - diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 5deb0233..da59803c 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -36,10 +36,11 @@ from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data from internlm.model.embedding import Embedding1D from internlm.model.linear import ( + CoarseGrainedFSTPAllGatherSyncHandler, FeedForward, + FSTPAllGatherSyncHandler, RewardModelLinear, ScaleColumnParallelLinear, - FSTPAllGatherSyncHandler, ) from internlm.model.multi_head_attention import MHA from internlm.model.utils import try_import_RMSNorm @@ -107,13 
+108,14 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - + if gpc.config.parallel["tensor"]["mode"] == "fstp": - handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler return model + def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): if gpc.config.parallel.zero1.fsdp: # set wrap_policy for fsdp wrap From d0f0c22cace187e62890aa34c3a0595115ceb394 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 13 Oct 2023 11:10:23 +0800 Subject: [PATCH 021/153] feat(model/linear.py): change pre backward from wqkv to block --- internlm/model/linear.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 3e37863d..56929eea 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -520,6 +520,10 @@ def _pre_backward_hook_for_block(block: nn.Module, grad_output): for handle in handles: handle.wait() + # start the all-gather for next block + if block_index - 1 >= 0: + self._all_gather_block_weight(block_index - 1) + def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): block_index = self.block_to_index[block] fsdp_modules = self.index_to_fsdp_modules[block_index] @@ -537,5 +541,5 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - for wqkv in self.FSTP_wqkvs: - wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) + # for wqkv in self.FSTP_wqkvs: + # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) From 82204eea59862b01c5aca68cad26c5060b1b7b16 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 16 Oct 2023 16:35:14 +0800 Subject: [PATCH 022/153] support hybrid overlap --- configs/7B_sft.py | 4 +- internlm/model/linear.py | 82 +++++++++++++++++++++++++---- internlm/train/training_internlm.py | 3 +- 3 files changed, 75 insertions(+), 14 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 814966b1..98bceeb4 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -2,10 +2,10 @@ DO_ALERT = False SEQ_LEN = 4096 -HIDDEN_SIZE = 4096 +HIDDEN_SIZE = 8192 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 32 +NUM_LAYER = 8 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 56929eea..890f1cb0 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -360,10 +360,12 @@ def _pre_forward_hook(module: nn.Module, inputs: Any): self.module_handler[next_module] = weights_handler def _post_forward_hook(module: nn.Module, input, output): - del self.FSTP_global_weights[module] - del self.module_handler[module] + if module in self.FSTP_global_weights: + del self.FSTP_global_weights[module] + if module in self.module_handler: + del self.module_handler[module] - def _pre_backward_hook(module: nn.Module, grad_input, grad_output): + def _pre_backward_hook(module: nn.Module, grad_output): block_index = self.module_block[module] name_index = self.module_name_index[module] if name_index == 4: @@ -396,7 +398,8 @@ def _post_backward_hook(module, grad_input, grad_output): module.register_forward_hook(_post_forward_hook) # 
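
The registration corrected just below leans on PyTorch's module hook API: register_backward_pre_hook does not exist as an nn.Module method, and register_module_full_backward_pre_hook is the module-level global registrar rather than a method, which is why the call moves to register_full_backward_pre_hook (an nn.Module method since torch 2.0). The full pattern as an illustrative helper (attach_overlap_hooks is not in the repo):

import torch.nn as nn

def attach_overlap_hooks(linear: nn.Module, pre_fwd, post_fwd, pre_bwd, post_bwd):
    linear.register_forward_pre_hook(pre_fwd)        # hook(module, inputs): wait / prefetch weights
    linear.register_forward_hook(post_fwd)           # hook(module, inputs, output): free gathered weight
    linear.register_full_backward_pre_hook(pre_bwd)  # hook(module, grad_output): prefetch for backward
    linear.register_full_backward_hook(post_bwd)     # hook(module, grad_input, grad_output): cleanup
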
module.register_backward_pre_hook(_pre_backward_hook) # module.register_backward_hook(_post_backward_hook) - module.register_module_full_backward_pre_hook(_pre_backward_hook) + module.register_full_backward_pre_hook(_pre_backward_hook) + module.register_full_backward_hook(_post_backward_hook) class CoarseGrainedFSTPAllGatherSyncHandler: @@ -410,6 +413,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.FSTP_blocks = [] self.FSTP_outs = [] self.FSTP_wqkvs = [] + self.FSTP_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] self.FSTP_global_handle = dict() # key: FSTP module; value: module global all-gather op handle self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward @@ -418,6 +422,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.block_to_index = dict() # key: transformer block; value: transformer block index self.index_to_block = dict() # key: transformer block index; value: transformer block self.index_to_fsdp_modules = dict() # key: transformer block index; value: fsdp modules + self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name + self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): @@ -430,6 +436,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non for _, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): for idx, block in enumerate(children): + index = 0 + self.block_module[idx] = {} self.FSTP_blocks.append(block) self.block_to_index[block] = idx self.index_to_block[idx] = block @@ -441,12 +449,17 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non # print(f"name: {name}", flush=True) if name == "out_proj": self.FSTP_outs.append(child) - self.module_to_index[child] = idx + # self.module_to_index[child] = idx if name == "Wqkv": self.FSTP_wqkvs.append(child) - self.module_to_index[child] = idx + # self.module_to_index[child] = idx if isinstance(child, FSTPLinear): + self.module_to_index[child] = idx + self.block_module[idx][index] = child + self.FSTP_modules.append(child) self.index_to_fsdp_modules[idx].append(child) + self.module_name_index[child] = index + index = index + 1 else: continue @@ -457,6 +470,7 @@ def _all_gather_block_weight(self, block_index: int): for module in fsdp_modules: total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) self.FSTP_global_weights[module] = total_weight + self.FSTP_global_handle[module] = weight_handle self.block_handles[block].append(weight_handle) def _register_sync_parameters_hook(self) -> None: @@ -498,6 +512,19 @@ def _post_forward_hook_for_block(block: nn.Module, input, output): del self.block_handles[block] for module in fsdp_modules: del self.FSTP_global_weights[module] + + + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + block_index = self.module_to_index[module] + if block_index != 0: + handler = self.FSTP_global_handle[module] + handler.wait() + + def _post_forward_hook_for_module(module: nn.Module, input, output): + if module in self.FSTP_global_weights: + del self.FSTP_global_weights[module] + if module in self.FSTP_global_handle: + del self.FSTP_global_handle[module] def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): block_index = 
self.module_to_index[module] @@ -531,15 +558,48 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): del self.block_handles[block] for module in fsdp_modules: del self.FSTP_global_weights[module] + + def _pre_backward_hook_for_module(module: nn.Module, grad_output): + block_index = self.module_to_index[module] + name_index = self.module_name_index[module] + if name_index == 4: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight - for block in self.FSTP_blocks: - block.register_forward_pre_hook(_pre_forward_hook_for_block) - block.register_forward_hook(_post_forward_hook_for_block) - block.register_full_backward_pre_hook(_pre_backward_hook_for_block) - block.register_full_backward_hook(_post_backward_hook_for_block) + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + else: + handler = self.FSTP_global_handle[module] + handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + + def _post_backward_hook_for_module(module, grad_input, grad_output): + del self.FSTP_global_weights[module] + + # for block in self.FSTP_blocks: + # block.register_forward_pre_hook(_pre_forward_hook_for_block) + # block.register_forward_hook(_post_forward_hook_for_block) + # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + # block.register_full_backward_hook(_post_backward_hook_for_block) for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) # for wqkv in self.FSTP_wqkvs: # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) + + for module in self.FSTP_modules: + module.register_forward_pre_hook(_pre_forward_hook_for_module) + module.register_forward_hook(_post_forward_hook_for_module) + module.register_full_backward_pre_hook(_pre_backward_hook_for_module) + module.register_full_backward_hook(_post_backward_hook_for_module) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index da59803c..572adbad 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -110,7 +110,8 @@ def initialize_model(): model = wrap_FSDP_model(model) if gpc.config.parallel["tensor"]["mode"] == "fstp": - handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + # handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler return model From 0d1fa037ddd3c899e3c42fbb9c013b17c4dd03dc Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 16 Oct 2023 20:13:59 +0800 Subject: [PATCH 023/153] feat(model/linear.py): set block 0 full weight --- internlm/model/linear.py | 133 +++++++++++++++---------- internlm/model/modeling_internlm.py | 6 +- internlm/model/multi_head_attention.py | 53 ++++++---- internlm/train/training_internlm.py | 13 ++- 4 files 
changed, 131 insertions(+), 74 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 890f1cb0..8a17c719 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -175,6 +175,7 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, + block_idx: int = 0, ): super().__init__() @@ -248,38 +249,62 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, + block_idx: int = 0, ): super().__init__() hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - self.w1 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w2 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w3 = FSTPLinear( - hidden_features, - out_features, - process_group, - bias=bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) + if block_idx == 0 and gpc.config.parallel.block_0_full_weight: + self.w1 = nn.Linear( + in_features, + hidden_features, + bias, + device=device, + dtype=dtype, + ) + self.w2 = nn.Linear( + in_features, + hidden_features, + bias, + device=device, + dtype=dtype, + ) + self.w3 = nn.Linear( + hidden_features, + out_features, + bias=bias, + device=device, + dtype=dtype, + ) + else: + self.w1 = FSTPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w2 = FSTPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w3 = FSTPLinear( + hidden_features, + out_features, + process_group, + bias=bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) def forward(self, x): w1_o = self.w1(x) @@ -449,10 +474,10 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non # print(f"name: {name}", flush=True) if name == "out_proj": self.FSTP_outs.append(child) - # self.module_to_index[child] = idx + self.module_to_index[child] = idx if name == "Wqkv": self.FSTP_wqkvs.append(child) - # self.module_to_index[child] = idx + self.module_to_index[child] = idx if isinstance(child, FSTPLinear): self.module_to_index[child] = idx self.block_module[idx][index] = child @@ -489,6 +514,7 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: self._all_gather_block_weight(block_index + 1) + # print(f"_all_gather_block_weight for block {block_index+1}", flush=True) def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): block_index = self.block_to_index[block] @@ -512,14 +538,13 @@ def _post_forward_hook_for_block(block: nn.Module, input, output): del self.block_handles[block] for module in fsdp_modules: del self.FSTP_global_weights[module] - - + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): block_index = self.module_to_index[module] if block_index != 0: handler = self.FSTP_global_handle[module] handler.wait() - + def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_weights: del 
self.FSTP_global_weights[module] @@ -558,46 +583,48 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): del self.block_handles[block] for module in fsdp_modules: del self.FSTP_global_weights[module] - + def _pre_backward_hook_for_module(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] - if name_index == 4: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight + if block_index != 0: + if name_index == 4: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - else: - handler = self.FSTP_global_handle[module] - handler.wait() - if name_index != 0: + # start the all-gather for next module next_module = self.block_module[block_index][name_index - 1] self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( next_module.weight, self.process_group, async_op=True ) self.FSTP_global_handle[next_module] = weights_handler + else: + handler = self.FSTP_global_handle[module] + handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler def _post_backward_hook_for_module(module, grad_input, grad_output): - del self.FSTP_global_weights[module] + if module in self.FSTP_global_weights: + del self.FSTP_global_weights[module] # for block in self.FSTP_blocks: - # block.register_forward_pre_hook(_pre_forward_hook_for_block) - # block.register_forward_hook(_post_forward_hook_for_block) - # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) - # block.register_full_backward_hook(_post_backward_hook_for_block) + # block.register_forward_pre_hook(_pre_forward_hook_for_block) + # block.register_forward_hook(_post_forward_hook_for_block) + # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + # block.register_full_backward_hook(_post_backward_hook_for_block) for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) # for wqkv in self.FSTP_wqkvs: # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) - + for module in self.FSTP_modules: module.register_forward_pre_hook(_pre_forward_hook_for_module) module.register_forward_hook(_post_forward_hook_for_module) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 228dbd34..cb933960 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -78,6 +78,7 @@ def __init__( use_swiglu: bool = True, use_flash_attn: bool = True, tp_mode: str = "origin_tp", + block_idx: int = 0, ): super().__init__() self.checkpoint = checkpoint @@ -103,6 +104,7 @@ def __init__( device=device, dtype=dtype, tp_mode=tp_mode, + block_idx=block_idx, ) self.dropout1 = nn.Dropout(drop_rate) @@ -123,6 +125,7 @@ def __init__( bias=False, device=device, dtype=dtype, + 
block_idx=block_idx, ) else: self.mlp = ParallelFusedMLP( @@ -344,6 +347,7 @@ def __init__( use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, tp_mode=self.tp_mode, + block_idx=lid, ) for lid in range(num_layers) ] @@ -410,7 +414,7 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N # Evaluation if hidden_states.ndim == 3: hidden_states = self.head(hidden_states, gather_dim=1) - else: # Training + else: # Training hidden_states = self.head(hidden_states, gather_dim=0) if not self.parallel_output: diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 1db98d7e..6c1e7d89 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -51,7 +51,6 @@ class _SeqAllToAll(torch.autograd.Function): @staticmethod def forward(ctx: Any, group: dist.ProcessGroup, input_: Tensor, scatter_idx: int, gather_idx: int) -> Tensor: - ctx.group = group ctx.scatter_idx = scatter_idx ctx.gather_idx = gather_idx @@ -91,7 +90,6 @@ def __init__( second_scatter_idx: int = 0, second_gather_idx: int = 1, ) -> None: - super().__init__() self.local_attn = local_attention self.spg = sequence_process_group @@ -178,6 +176,7 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, tp_mode: str = "origin_tp", + block_idx: int = 0, ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -206,14 +205,23 @@ def __init__( # notice here should change bias=True Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear - self.Wqkv = Wqkv_cls( - embed_dim, - 3 * embed_dim, - process_group, - bias=False, - sequence_parallel=gpc.config.parallel.sequence_parallel, - **factory_kwargs, - ) # according to https://spaces.ac.cn/archives/9577 + if block_idx == 0 and tp_mode != "origin_tp" and gpc.config.parallel.block_0_full_weight: + Wqkv_cls = nn.Linear + self.Wqkv = Wqkv_cls( + embed_dim, + 3 * embed_dim, + bias=False, + **factory_kwargs, + ) + else: + self.Wqkv = Wqkv_cls( + embed_dim, + 3 * embed_dim, + process_group, + bias=False, + sequence_parallel=gpc.config.parallel.sequence_parallel, + **factory_kwargs, + ) # according to https://spaces.ac.cn/archives/9577 inner_attn_cls = FlashSelfAttention if use_flash_attn else SelfAttention inner_cross_attn_cls = FlashCrossAttention if use_flash_attn else CrossAttention @@ -227,14 +235,23 @@ def __init__( # output projection always have the bias (for now) out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear - self.out_proj = out_proj_cls( - embed_dim, - embed_dim, - process_group, - bias=False, - sequence_parallel=gpc.config.parallel.sequence_parallel, - **factory_kwargs, - ) + if block_idx == 0 and tp_mode != "origin_tp" and gpc.config.parallel.block_0_full_weight: + out_proj_cls = nn.Linear + self.out_proj = out_proj_cls( + embed_dim, + embed_dim, + bias=False, + **factory_kwargs, + ) + else: + self.out_proj = out_proj_cls( + embed_dim, + embed_dim, + process_group, + bias=False, + sequence_parallel=gpc.config.parallel.sequence_parallel, + **factory_kwargs, + ) # need to assign tp attribute so that internlm know it is tensor parallel module if gpc.get_world_size(ParallelMode.TENSOR) > 1: for name in ["out_proj", "Wqkv"]: diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 572adbad..24040a02 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -110,8 +110,8 @@ def initialize_model(): 
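
The dispatch introduced in this commit keeps block 0 unsharded: the first block's gathers have no earlier compute to overlap with, so it trades memory for latency by holding full weights. Reduced to a sketch (assuming the block_0_full_weight flag this patch adds to the parallel config):

from torch import nn
from internlm.core.context import global_context as gpc
from internlm.model.linear import FSTPLinear

def make_proj(in_f, out_f, process_group, block_idx, **factory_kwargs):
    # Plain nn.Linear for block 0, sharded FSTPLinear everywhere else.
    if block_idx == 0 and gpc.config.parallel.block_0_full_weight:
        return nn.Linear(in_f, out_f, bias=False, **factory_kwargs)
    return FSTPLinear(
        in_f, out_f, process_group, bias=False,
        sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs,
    )
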
model = wrap_FSDP_model(model) if gpc.config.parallel["tensor"]["mode"] == "fstp": - # handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) - handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler return model @@ -396,6 +396,9 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): ) +tgs_list = [] + + @llm_timeout(func_name="record_current_batch_training_metrics") def record_current_batch_training_metrics( get_tflops_func, @@ -568,3 +571,9 @@ def record_current_batch_training_metrics( step_count=batch_count, cur_step_loss=loss.item(), ) + + if batch_count >= 5: + tgs_list.append(tgs_origin) + if batch_count == gpc.config.data.total_steps - 1: + print(tgs_list, flush=True) + print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) From d1af0d6aee32a71385ef89983aef9ebb2417752c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 17 Oct 2023 10:13:56 +0800 Subject: [PATCH 024/153] feat(model/linear.py): block-grained backward --- configs/7B_sft.py | 9 ++--- internlm/model/linear.py | 77 +++++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 98bceeb4..36f9ac14 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 8192 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 8 +NUM_LAYER = 4 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=False, - total_steps=50000, + total_steps=20, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded @@ -161,10 +161,11 @@ sequence parallel (bool): enable/disable sequence parallel, defaults to False. """ parallel = dict( - zero1=dict(size=1, fsdp=False), - tensor=dict(size=8, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. 
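
With block 0 resident in full, the backward prefetch in the next hunks becomes block-grained and stops one block early: when a block's backward begins, the gathers for block i-1 are launched, and the guard `block_index - 1 > 0` (rather than >= 0) skips block 0, which has nothing left to gather. As a sketch, in terms of the prefetch_block helper outlined earlier:

def _pre_backward_hook_for_block(self, block, grad_output):
    i = self.block_to_index[block]
    if i - 1 > 0:  # stop before block 0, whose weights are already full
        self.prefetch_block(i - 1)
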
if the mode is 'fstp', the sequence_parallel should be True + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, mode="fstp"), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, + block_0_full_weight=True, ) cudnn_deterministic = False diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 8a17c719..8e19ab69 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -559,21 +559,21 @@ def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): def _pre_backward_hook_for_block(block: nn.Module, grad_output): block_index = self.block_to_index[block] - if block_index == gpc.config.NUM_LAYER - 1: - # all gather weight for the last block - fsdp_modules = self.index_to_fsdp_modules[block_index] - for module in fsdp_modules: - total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handle.wait() - self.FSTP_global_weights[module] = total_weight - else: - # wait handle for current block - handles = self.block_handles[block] - for handle in handles: - handle.wait() + # if block_index == gpc.config.NUM_LAYER - 1: + # # all gather weight for the last block + # fsdp_modules = self.index_to_fsdp_modules[block_index] + # for module in fsdp_modules: + # total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) + # weight_handle.wait() + # self.FSTP_global_weights[module] = total_weight + # else: + # # wait handle for current block + # handles = self.block_handles[block] + # for handle in handles: + # handle.wait() # start the all-gather for next block - if block_index - 1 >= 0: + if block_index - 1 > 0: self._all_gather_block_weight(block_index - 1) def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): @@ -588,36 +588,41 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] if block_index != 0: - if name_index == 4: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - else: + # if name_index == 4: + # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + # weight_handler.wait() + # self.FSTP_global_weights[module] = total_weight + + # # start the all-gather for next module + # next_module = self.block_module[block_index][name_index - 1] + # self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + # next_module.weight, self.process_group, async_op=True + # ) + # self.FSTP_global_handle[next_module] = weights_handler + # else: + # handler = self.FSTP_global_handle[module] + # handler.wait() + # if name_index != 0: + # next_module = self.block_module[block_index][name_index - 1] + # self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + # next_module.weight, self.process_group, async_op=True + # ) + # self.FSTP_global_handle[next_module] = weights_handler + if module in self.FSTP_global_handle: handler = self.FSTP_global_handle[module] handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - 
self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_weights: del self.FSTP_global_weights[module] + if module in self.FSTP_global_handle: + del self.FSTP_global_handle[module] - # for block in self.FSTP_blocks: - # block.register_forward_pre_hook(_pre_forward_hook_for_block) - # block.register_forward_hook(_post_forward_hook_for_block) - # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) - # block.register_full_backward_hook(_post_backward_hook_for_block) + for block in self.FSTP_blocks: + # block.register_forward_pre_hook(_pre_forward_hook_for_block) + # block.register_forward_hook(_post_forward_hook_for_block) + block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + # block.register_full_backward_hook(_post_backward_hook_for_block) for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) From 229cc5c68c518734edfc01d36e6bd616d32a7224 Mon Sep 17 00:00:00 2001 From: "chenxun.p" Date: Tue, 17 Oct 2023 11:15:54 +0800 Subject: [PATCH 025/153] impl reduce scatter async --- .../core/scheduler/no_pipeline_scheduler.py | 1 + internlm/model/linear.py | 23 +++++++++-- internlm/model/utils.py | 28 ++++++++------ .../solver/optimizer/hybrid_zero_optim.py | 38 +++++++++++++++++-- 4 files changed, 71 insertions(+), 19 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 56661d8c..f0caf05c 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -194,6 +194,7 @@ def forward_backward_step( _output, _loss, _moe_loss = self._train_one_batch( _data, _label, engine, forward_only, return_loss, self._grad_accum_size ) + engine.optimizer.reset_reduce_bucket() if return_loss: loss += _loss diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 8e19ab69..b141829e 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -329,6 +329,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name + self.reduce_scatter_handlers = {} + # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): model = [model] @@ -337,16 +339,22 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non if isinstance(_chunk, NaiveAMPModel): _chunk = _chunk.model - for _, children in _chunk.named_children(): + for _chunk_name, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): for idx, block in enumerate(children): index = 0 self.block_module[idx] = {} - for _, sub in block.named_children(): + for _sub_name, sub in block.named_children(): sub_modules = list(sub.children()) if len(sub_modules) > 0: for name, child in sub.named_children(): if isinstance(child, FSTPLinear): + + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + self.FSTP_modules.append(child) self.module_block[child] = 
idx self.block_module[idx][index] = child @@ -450,6 +458,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} + self.reduce_scatter_handlers = {} + # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): model = [model] @@ -458,7 +468,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non if isinstance(_chunk, NaiveAMPModel): _chunk = _chunk.model - for _, children in _chunk.named_children(): + for _chunk_name, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): for idx, block in enumerate(children): index = 0 @@ -467,7 +477,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.block_to_index[block] = idx self.index_to_block[idx] = block self.index_to_fsdp_modules[idx] = [] - for _, sub in block.named_children(): + for _sub_name, sub in block.named_children(): sub_modules = list(sub.children()) if len(sub_modules) > 0: for name, child in sub.named_children(): @@ -485,6 +495,11 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.index_to_fsdp_modules[idx].append(child) self.module_name_index[child] = index index = index + 1 + + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") else: continue diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 97319d98..78ad456d 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -324,9 +324,9 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod raise RuntimeError("fused_dense only supports matrix dims <= 2M") output = F.linear(total_x, total_weight, total_bias) if ctx.compute_weight_gradient: - ctx.save_for_backward(x, weight) + ctx.save_for_backward(x, weight, bias) else: - ctx.save_for_backward(weight) + ctx.save_for_backward(weight, bias) return output if not return_residual else (output, x) @staticmethod @@ -340,10 +340,10 @@ def backward(ctx, grad_output, *args): all_gather_handler = ctx.all_gather_handler module = ctx.module if ctx.compute_weight_gradient: - x, weight = ctx.saved_tensors + x, weight, bias = ctx.saved_tensors total_x = x else: - (weight,) = ctx.saved_tensors + weight, bias = ctx.saved_tensors total_x = None batch_shape = grad_output.shape[:-1] batch_dim = batch_shape.numel() @@ -368,9 +368,15 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + assert hasattr(weight, "_fstp_reduce_scatter_str") + all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) + grad_weight = torch.empty(grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:], dtype=grad_weight.dtype, device=grad_weight.device) if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + 
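
The pattern in this hunk defers gradient reduction: backward launches an async reduce-scatter, stashes the (handle, sharded grad) pair under the parameter's _fstp_reduce_scatter_str key, and hands autograd a placeholder of the sharded shape; the optimizer waits on the handle and installs the real shard later (see reset_reduce_bucket in the optimizer hunk below). A condensed sketch, using zeros for the placeholder since the optimizer-side fix in a later commit accumulates with `+=`:

import torch
import torch.distributed as dist
from flash_attn.utils.distributed import reduce_scatter_raw  # as used throughout this series

def deferred_reduce_scatter(handler, param, full_grad, process_group):
    shard, handle = reduce_scatter_raw(full_grad, process_group, async_op=True)
    handler.reduce_scatter_handlers[param._fstp_reduce_scatter_str] = (handle, shard)
    # placeholder with the sharded shape keeps autograd's bookkeeping consistent;
    # zeros (not empty) so a later `param.grad += shard` accumulates from a clean base
    ws = dist.get_world_size(process_group)
    return torch.zeros(full_grad.shape[0] // ws, *full_grad.shape[1:],
                       dtype=full_grad.dtype, device=full_grad.device)
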
grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + assert hasattr(bias, "_fstp_reduce_scatter_str") + all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) + grad_bias = torch.empty(grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:], dtype=grad_bias.dtype, device=grad_bias.device) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -384,11 +390,11 @@ def backward(ctx, grad_output, *args): else: grad_input = None - if ctx.needs_input_grad[1]: - if world_size > 1: - handle_grad_weight.wait() - if grad_bias is not None: - handle_grad_bias.wait() + # if ctx.needs_input_grad[1]: + # if world_size > 1: + # handle_grad_weight.wait() + # if grad_bias is not None: + # handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None, None diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 97004eb9..c6e9aaba 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -65,6 +65,8 @@ def __init__( hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale + self._fstp_handler = gpc.config.fstp_handler + # Zero related args reduce_bucket_size = zero_cfg.reduce_bucket_size clip_grad_norm = zero_cfg.clip_grad_norm @@ -301,8 +303,7 @@ def _define_and_attach(param, reduce_rank=None): # NOT IMPORTANT BUT GOOD TO KNOW: # args here is not grad, but allow_unreacable and accumulate_grad def reduce_grad_hook(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_func() + reduction_func() accum_grad_obj.register_hook(reduce_grad_hook) @@ -322,6 +323,20 @@ def belongs_to_current_rank(self, param) -> bool: group_id = getattr(param, "group_id") return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) + def reset_reduce_bucket(self) -> None: + for bucket in self._bucket_store: + for rank, params in bucket._params.items(): + for _param in params: + if not hasattr(_param, "_fstp_reduce_scatter_str"): + continue + + key = getattr(_param, "_fstp_reduce_scatter_str") + comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] + comm_handle.wait() + _param.grad = _grad + + bucket.reset_by_rank(rank) + def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): param_size = param.numel() @@ -332,11 +347,26 @@ def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): current_bucket = self._bucket_store[group_id] if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: - self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False) + # wait reduce scatter communication + params = current_bucket.get_param(reduce_rank) + for _param in params: + if not hasattr(_param, "_fstp_reduce_scatter_str"): + continue + + key = getattr(_param, "_fstp_reduce_scatter_str") + comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] + comm_handle.wait() + _param.grad = _grad + + # reduce grad + if self.skip_grad_reduce is False: + self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False) + else: + current_bucket.reset_by_rank(reduce_rank) # the param must not be reduced to ensure correctness is_param_reduced = self._param_store.is_param_reduced(param) - if is_param_reduced: + if is_param_reduced and self.skip_grad_reduce is 
False: msg = ( f"Parameter of size ({param.size()}) has already been reduced, " + "duplicate reduction will lead to arithmetic incorrectness" From 4e99a7fdbc88e398255d63a9b22854b5ded5deb3 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 17 Oct 2023 11:30:44 +0800 Subject: [PATCH 026/153] feat(train/training_internlm.py): remove abnormal tgs when calculating avg tgs --- internlm/train/training_internlm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 24040a02..cc310a21 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -576,4 +576,8 @@ def record_current_batch_training_metrics( tgs_list.append(tgs_origin) if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) + avg_tgs = sum(tgs_list) / len(tgs_list) + for tgs in tgs_list.copy(): + if abs(tgs - avg_tgs) > 1000: + tgs_list.remove(tgs) print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) From 6682f5d92a02111777f5c1fbc8c0765c9770ffa2 Mon Sep 17 00:00:00 2001 From: "chenxun.p" Date: Tue, 17 Oct 2023 15:10:07 +0800 Subject: [PATCH 027/153] fix reduce scatter async bug --- internlm/model/utils.py | 4 ++-- internlm/solver/optimizer/hybrid_zero_optim.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 78ad456d..0194e84a 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -371,12 +371,12 @@ def backward(ctx, grad_output, *args): grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) assert hasattr(weight, "_fstp_reduce_scatter_str") all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = torch.empty(grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:], dtype=grad_weight.dtype, device=grad_weight.device) + grad_weight = torch.zeros(grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:], dtype=grad_weight.dtype, device=grad_weight.device) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) assert hasattr(bias, "_fstp_reduce_scatter_str") all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = torch.empty(grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:], dtype=grad_bias.dtype, device=grad_bias.device) + grad_bias = torch.zeros(grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:], dtype=grad_bias.dtype, device=grad_bias.device) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index c6e9aaba..950d35e8 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -333,7 +333,7 @@ def reset_reduce_bucket(self) -> None: key = getattr(_param, "_fstp_reduce_scatter_str") comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() - _param.grad = _grad + _param.grad += _grad bucket.reset_by_rank(rank) @@ -356,7 +356,7 @@ def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): key = getattr(_param, 
"_fstp_reduce_scatter_str") comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() - _param.grad = _grad + _param.grad += _grad # reduce grad if self.skip_grad_reduce is False: From 6408b944c2e6510253f3b5ca7e3680ed56a6b528 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 15:14:39 +0800 Subject: [PATCH 028/153] support fine grained --- internlm/model/linear.py | 77 +++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 8e19ab69..e8727ac5 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -449,7 +449,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.index_to_fsdp_modules = dict() # key: transformer block index; value: fsdp modules self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - + self.head = [] + # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): model = [model] @@ -487,16 +488,18 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non index = index + 1 else: continue + elif isinstance(children, ScaleColumnParallelLinear): + self.head.append(children) def _all_gather_block_weight(self, block_index: int): block = self.index_to_block[block_index] fsdp_modules = self.index_to_fsdp_modules[block_index] - self.block_handles[block] = [] + # self.block_handles[block] = [] for module in fsdp_modules: total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) self.FSTP_global_weights[module] = total_weight self.FSTP_global_handle[module] = weight_handle - self.block_handles[block].append(weight_handle) + # self.block_handles[block].append(weight_handle) def _register_sync_parameters_hook(self) -> None: """ @@ -558,6 +561,7 @@ def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): self._all_gather_block_weight(block_index - 1) def _pre_backward_hook_for_block(block: nn.Module, grad_output): + # import pdb; pdb.set_trace() block_index = self.block_to_index[block] # if block_index == gpc.config.NUM_LAYER - 1: # # all gather weight for the last block @@ -571,10 +575,14 @@ def _pre_backward_hook_for_block(block: nn.Module, grad_output): # handles = self.block_handles[block] # for handle in handles: # handle.wait() - + # if block_index == gpc.config.NUM_LAYER - 1: + # self._all_gather_block_weight(block_index) # start the all-gather for next block if block_index - 1 > 0: self._all_gather_block_weight(block_index - 1) + + # def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): + # self._all_gather_block_weight(gpc.config.NUM_LAYER - 1) def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): block_index = self.block_to_index[block] @@ -588,45 +596,58 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] if block_index != 0: - # if name_index == 4: - # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - # weight_handler.wait() - # self.FSTP_global_weights[module] = total_weight - - # # start the all-gather for next module - # next_module = self.block_module[block_index][name_index - 1] - # 
self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - # next_module.weight, self.process_group, async_op=True - # ) - # self.FSTP_global_handle[next_module] = weights_handler - # else: - # handler = self.FSTP_global_handle[module] - # handler.wait() - # if name_index != 0: - # next_module = self.block_module[block_index][name_index - 1] - # self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - # next_module.weight, self.process_group, async_op=True - # ) - # self.FSTP_global_handle[next_module] = weights_handler - if module in self.FSTP_global_handle: + if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + elif name_index == 0: + handler = self.FSTP_global_handle[module] + handler.wait() + + if block_index - 1 > 0: + next_module = self.block_module[block_index - 1][4] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + else: handler = self.FSTP_global_handle[module] handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + # if module in self.FSTP_global_handle: + # handler = self.FSTP_global_handle[module] + # handler.wait() def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_weights: del self.FSTP_global_weights[module] if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] + + # for head in self.head: + # head.register_full_backward_hook(_post_backward_hook_for_head) - for block in self.FSTP_blocks: + # for block in self.FSTP_blocks: # block.register_forward_pre_hook(_pre_forward_hook_for_block) # block.register_forward_hook(_post_forward_hook_for_block) - block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) # block.register_full_backward_hook(_post_backward_hook_for_block) for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - + # for wqkv in self.FSTP_wqkvs: # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) From 5c38cb64095513c3740e9618c41e143608169ab5 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 15:38:24 +0800 Subject: [PATCH 029/153] add head overlap --- internlm/model/linear.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 16b0c85f..71bdf057 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -596,8 +596,11 @@ def _pre_backward_hook_for_block(block: nn.Module, grad_output): if block_index - 1 > 0: self._all_gather_block_weight(block_index - 1) - # def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): - # 
self._all_gather_block_weight(gpc.config.NUM_LAYER - 1) + def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): + first_module = self.block_module[gpc.config.NUM_LAYER - 1][4] + total_weight, weight_handler = all_gather_raw(first_module.weight, self.process_group, async_op=True) + self.FSTP_global_handle[first_module] = weight_handler + self.FSTP_global_weights[first_module] = total_weight def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): block_index = self.block_to_index[block] @@ -612,9 +615,10 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): name_index = self.module_name_index[module] if block_index != 0: if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler = self.FSTP_global_handle[module] weight_handler.wait() - self.FSTP_global_weights[module] = total_weight + # self.FSTP_global_weights[module] = total_weight # start the all-gather for next module next_module = self.block_module[block_index][name_index - 1] @@ -651,8 +655,8 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] - # for head in self.head: - # head.register_full_backward_hook(_post_backward_hook_for_head) + for head in self.head: + head.register_full_backward_hook(_post_backward_hook_for_head) # for block in self.FSTP_blocks: # block.register_forward_pre_hook(_pre_forward_hook_for_block) From 5abe519c4c9806ecce76b29dcd88f738c1014d67 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 16:37:06 +0800 Subject: [PATCH 030/153] remove full weight for block 0 --- internlm/model/linear.py | 152 +++++++++++-------------- internlm/model/multi_head_attention.py | 50 +++----- 2 files changed, 85 insertions(+), 117 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 71bdf057..cc9524a1 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -12,6 +12,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel +from internlm.model.embedding import Embedding1D from internlm.model.utils import ( Silu, all_gather_raw, @@ -255,56 +256,33 @@ def __init__( hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - if block_idx == 0 and gpc.config.parallel.block_0_full_weight: - self.w1 = nn.Linear( - in_features, - hidden_features, - bias, - device=device, - dtype=dtype, - ) - self.w2 = nn.Linear( - in_features, - hidden_features, - bias, - device=device, - dtype=dtype, - ) - self.w3 = nn.Linear( - hidden_features, - out_features, - bias=bias, - device=device, - dtype=dtype, - ) - else: - self.w1 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w2 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w3 = FSTPLinear( - hidden_features, - out_features, - process_group, - bias=bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) + self.w1 = FSTPLinear( + 
in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w2 = FSTPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w3 = FSTPLinear( + hidden_features, + out_features, + process_group, + bias=bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) def forward(self, x): w1_o = self.w1(x) @@ -458,6 +436,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} self.head = [] + self.embedding = [] self.reduce_scatter_handlers = {} @@ -505,6 +484,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non continue elif isinstance(children, ScaleColumnParallelLinear): self.head.append(children) + elif isinstance(children, Embedding1D): + self.embedding.append(children) def _all_gather_block_weight(self, block_index: int): block = self.index_to_block[block_index] @@ -532,7 +513,6 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: self._all_gather_block_weight(block_index + 1) - # print(f"_all_gather_block_weight for block {block_index+1}", flush=True) def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): block_index = self.block_to_index[block] @@ -548,6 +528,10 @@ def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): handles = self.block_handles[block] for handle in handles: handle.wait() + + def _pre_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): + self._all_gather_block_weight(0) + def _post_forward_hook_for_block(block: nn.Module, input, output): block_index = self.block_to_index[block] @@ -557,11 +541,10 @@ def _post_forward_hook_for_block(block: nn.Module, input, output): for module in fsdp_modules: del self.FSTP_global_weights[module] - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any,): block_index = self.module_to_index[module] - if block_index != 0: - handler = self.FSTP_global_handle[module] - handler.wait() + handler = self.FSTP_global_handle[module] + handler.wait() def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_weights: @@ -593,7 +576,7 @@ def _pre_backward_hook_for_block(block: nn.Module, grad_output): # if block_index == gpc.config.NUM_LAYER - 1: # self._all_gather_block_weight(block_index) # start the all-gather for next block - if block_index - 1 > 0: + if block_index - 1 >= 0: self._all_gather_block_weight(block_index - 1) def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): @@ -613,38 +596,38 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): def _pre_backward_hook_for_module(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] - if block_index != 0: - if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler = self.FSTP_global_handle[module] - 
weight_handler.wait() - # self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module + + if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: + # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler = self.FSTP_global_handle[module] + weight_handler.wait() + # self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + elif name_index == 0: + handler = self.FSTP_global_handle[module] + handler.wait() + + if block_index - 1 >= 0: + next_module = self.block_module[block_index - 1][4] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + else: + handler = self.FSTP_global_handle[module] + handler.wait() + if name_index != 0: next_module = self.block_module[block_index][name_index - 1] self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( next_module.weight, self.process_group, async_op=True ) self.FSTP_global_handle[next_module] = weights_handler - elif name_index == 0: - handler = self.FSTP_global_handle[module] - handler.wait() - - if block_index - 1 > 0: - next_module = self.block_module[block_index - 1][4] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - else: - handler = self.FSTP_global_handle[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler # if module in self.FSTP_global_handle: # handler = self.FSTP_global_handle[module] # handler.wait() @@ -655,6 +638,9 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] + for embedding in self.embedding: + embedding.register_forward_hook(_pre_forward_hook_for_embedding) + for head in self.head: head.register_full_backward_hook(_post_backward_hook_for_head) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 6c1e7d89..7a0f4ed7 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -205,23 +205,14 @@ def __init__( # notice here should change bias=True Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear - if block_idx == 0 and tp_mode != "origin_tp" and gpc.config.parallel.block_0_full_weight: - Wqkv_cls = nn.Linear - self.Wqkv = Wqkv_cls( - embed_dim, - 3 * embed_dim, - bias=False, - **factory_kwargs, - ) - else: - self.Wqkv = Wqkv_cls( - embed_dim, - 3 * embed_dim, - process_group, - bias=False, - sequence_parallel=gpc.config.parallel.sequence_parallel, - **factory_kwargs, - ) # according to https://spaces.ac.cn/archives/9577 + self.Wqkv = Wqkv_cls( + embed_dim, + 3 * embed_dim, + process_group, + bias=False, + sequence_parallel=gpc.config.parallel.sequence_parallel, + **factory_kwargs, + ) # according to 
https://spaces.ac.cn/archives/9577 inner_attn_cls = FlashSelfAttention if use_flash_attn else SelfAttention inner_cross_attn_cls = FlashCrossAttention if use_flash_attn else CrossAttention @@ -235,23 +226,14 @@ def __init__( # output projection always have the bias (for now) out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear - if block_idx == 0 and tp_mode != "origin_tp" and gpc.config.parallel.block_0_full_weight: - out_proj_cls = nn.Linear - self.out_proj = out_proj_cls( - embed_dim, - embed_dim, - bias=False, - **factory_kwargs, - ) - else: - self.out_proj = out_proj_cls( - embed_dim, - embed_dim, - process_group, - bias=False, - sequence_parallel=gpc.config.parallel.sequence_parallel, - **factory_kwargs, - ) + self.out_proj = out_proj_cls( + embed_dim, + embed_dim, + process_group, + bias=False, + sequence_parallel=gpc.config.parallel.sequence_parallel, + **factory_kwargs, + ) # need to assign tp attribute so that internlm know it is tensor parallel module if gpc.get_world_size(ParallelMode.TENSOR) > 1: for name in ["out_proj", "Wqkv"]: From 16ef7b788915bcf222ae96fdc556f168c3c9c6b7 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 17:16:39 +0800 Subject: [PATCH 031/153] add test --- configs/13B_sft.py | 180 ++++++++++++++++++ configs/20B_sft.py | 180 ++++++++++++++++++ configs/30B_sft.py | 180 ++++++++++++++++++ configs/7B_sft.py | 7 +- .../solver/optimizer/hybrid_zero_optim.py | 5 +- internlm/train/training_internlm.py | 2 +- 6 files changed, 547 insertions(+), 7 deletions(-) create mode 100644 configs/13B_sft.py create mode 100644 configs/20B_sft.py create mode 100644 configs/30B_sft.py diff --git a/configs/13B_sft.py b/configs/13B_sft.py new file mode 100644 index 00000000..e3e17ae0 --- /dev/null +++ b/configs/13B_sft.py @@ -0,0 +1,180 @@ +JOB_NAME = "13b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. 
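
# NOTE (editorial sketch): the async reduce-scatter bookkeeping introduced in
# the "impl reduce scatter async" / "fix reduce scatter async bug" patches
# above boils down to: launch a non-blocking reduce-scatter of each full
# weight gradient, stash (handle, shard) in a dict keyed by a per-parameter
# string, and let the optimizer wait and accumulate later
# (handle.wait(); param.grad += shard). A minimal, hypothetical illustration;
# the function and argument names are assumptions, not the repository API.
import torch
import torch.distributed as dist

def async_reduce_scatter(full_grad: torch.Tensor, group, handlers: dict, key: str):
    # Each rank ends up owning a 1/world_size shard of the gradient along dim 0.
    world_size = dist.get_world_size(group)
    assert full_grad.shape[0] % world_size == 0
    shard = torch.empty(
        full_grad.shape[0] // world_size, *full_grad.shape[1:],
        dtype=full_grad.dtype, device=full_grad.device,
    )
    handle = dist.reduce_scatter_tensor(
        shard, full_grad.contiguous(), group=group, async_op=True
    )
    # Stash for a later wait, mirroring the reduce_scatter_handlers convention.
    handlers[key] = (handle, shard)
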
+ # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. 
+ For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, mode="origin_tp"), + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=True, +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/20B_sft.py b/configs/20B_sft.py new file mode 100644 index 00000000..1d093efc --- /dev/null +++ b/configs/20B_sft.py @@ -0,0 +1,180 @@ +JOB_NAME = "13b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. 
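
# NOTE (editorial sketch): the zero1 sizing rules documented in these config
# docstrings can be made concrete. A minimal helper under the stated rules;
# `resolve_zero1_size` and `dp_world_size` are illustrative names, not
# repository APIs.
def resolve_zero1_size(size: int, dp_world_size: int) -> int:
    if size <= 0:
        return dp_world_size  # shard optimizer states across the whole dp group
    if size == 1:
        return 1              # ZeRO disabled; every dp rank keeps full states
    # size > 1: the zero world size must be a subset of the dp world size,
    # e.g. a <= 8 setting to shard within a node for smaller models.
    assert size <= dp_world_size and dp_world_size % size == 0
    return size

# Example: zero1=dict(size=-1) with a dp world size of 8 resolves to 8.
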
+ oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=4, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. 
size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, mode="fstp"), + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=True, +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_sft.py b/configs/30B_sft.py new file mode 100644 index 00000000..5ac67451 --- /dev/null +++ b/configs/30B_sft.py @@ -0,0 +1,180 @@ +JOB_NAME = "13b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=4, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, mode="fstp"), + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=True, +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 36f9ac14..106548a2 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -2,10 +2,10 @@ DO_ALERT = False SEQ_LEN = 4096 -HIDDEN_SIZE = 8192 +HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 4 +NUM_LAYER = 32 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -51,7 +51,7 @@ # micro_num means the number of micro_batch contained in one gradient update micro_num=4, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, + micro_bsz=4, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -165,7 +165,6 @@ tensor=dict(size=8, mode="fstp"), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, - block_0_full_weight=True, ) cudnn_deterministic = False diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 950d35e8..c7c10071 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -64,8 +64,9 @@ def __init__( backoff_factor = grad_scal_cfg.backoff_factor hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - - self._fstp_handler = gpc.config.fstp_handler + + if gpc.config.parallel["tensor"]["mode"] == "fstp": + self._fstp_handler = gpc.config.fstp_handler # Zero related args reduce_bucket_size = zero_cfg.reduce_bucket_size diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index cc310a21..93903a38 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -578,6 +578,6 @@ def record_current_batch_training_metrics( print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) for tgs in tgs_list.copy(): - if abs(tgs - avg_tgs) > 1000: + if abs(tgs - avg_tgs) > 400: tgs_list.remove(tgs) print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) From a5aeab2a3f06c7b07e302f911c2bd6ae2a69362e Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 19:54:21 +0800 Subject: [PATCH 032/153] memory profiling test --- configs/20B_sft.py | 4 +-- internlm/model/linear.py | 12 +------- .../solver/optimizer/hybrid_zero_optim.py | 29 +++++++++++++++---- train.py | 2 ++ 4 files changed, 28 insertions(+), 19 deletions(-) diff --git a/configs/20B_sft.py b/configs/20B_sft.py index 1d093efc..bc63d346 100644 --- a/configs/20B_sft.py +++ b/configs/20B_sft.py @@ -1,4 +1,4 @@ -JOB_NAME = "13b_train" +JOB_NAME = "20b_train" DO_ALERT = False SEQ_LEN = 4096 @@ -51,7 +51,7 @@ # micro_num means the number of micro_batch contained in one gradient update micro_num=4, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=4, + micro_bsz=2, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate diff --git 
a/internlm/model/linear.py b/internlm/model/linear.py index cc9524a1..0ea6ee30 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -423,7 +423,6 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.process_group = process_group self.FSTP_blocks = [] self.FSTP_outs = [] - self.FSTP_wqkvs = [] self.FSTP_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] self.FSTP_global_handle = dict() # key: FSTP module; value: module global all-gather op handle @@ -465,9 +464,6 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non if name == "out_proj": self.FSTP_outs.append(child) self.module_to_index[child] = idx - if name == "Wqkv": - self.FSTP_wqkvs.append(child) - self.module_to_index[child] = idx if isinstance(child, FSTPLinear): self.module_to_index[child] = idx self.block_module[idx][index] = child @@ -488,7 +484,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.embedding.append(children) def _all_gather_block_weight(self, block_index: int): - block = self.index_to_block[block_index] + #block = self.index_to_block[block_index] fsdp_modules = self.index_to_fsdp_modules[block_index] # self.block_handles[block] = [] for module in fsdp_modules: @@ -552,12 +548,6 @@ def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] - def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): - block_index = self.module_to_index[module] - # start the all-gather for next block - if block_index - 1 >= 0: - self._all_gather_block_weight(block_index - 1) - def _pre_backward_hook_for_block(block: nn.Module, grad_output): # import pdb; pdb.set_trace() block_index = self.block_to_index[block] diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index c7c10071..d2268274 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -39,6 +39,14 @@ inf = math.inf logger = get_logger(__file__) +def print_memory(msg): + + if gpc.get_global_rank() == 0: + print(msg, flush=True) + print("memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, flush=True) + print("max memory allocated: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) + print("===========================================") + class HybridZeroOptimizer(BaseOptimizer): """ @@ -335,6 +343,7 @@ def reset_reduce_bucket(self) -> None: comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() _param.grad += _grad + self._fstp_handler.reduce_scatter_handlers[key] = None bucket.reset_by_rank(rank) @@ -358,6 +367,7 @@ def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() _param.grad += _grad + self._fstp_handler.reduce_scatter_handlers[key] = None # reduce grad if self.skip_grad_reduce is False: @@ -565,6 +575,7 @@ def step(self, closure=None): # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients + print_memory("No 1") if not self._overlap_sync_grad: for group_id in range(len(self._fp16_param_groups)): for param in self._fp16_param_groups[group_id]: @@ -589,7 +600,7 @@ def step(self, closure=None): bucket.empty() self._bucket_in_progress = [] self._param_store.clear_grads_of_previous_reduced_params() 
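
# NOTE (editorial sketch): print_memory above dumps allocator counters in GiB
# at numbered checkpoints ("No 1", "No 2", ...). A variant that also reports
# the delta between consecutive checkpoints can make the per-phase cost easier
# to read; this is an assumed helper for illustration, not part of the patch.
def print_memory_delta(msg, _last=[0.0]):  # mutable default keeps last reading
    if gpc.get_global_rank() == 0:
        cur = torch.cuda.memory_allocated() / 1024**3
        peak = torch.cuda.max_memory_allocated() / 1024**3
        print(
            f"{msg}: allocated={cur:.2f} GiB (delta={cur - _last[0]:+.2f}), "
            f"peak={peak:.2f} GiB",
            flush=True,
        )
        _last[0] = cur
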
- + print_memory("No 2") # compute norm for gradients in the last bucket total_norms = {} for group_id in range(self.num_param_groups): @@ -611,10 +622,12 @@ def step(self, closure=None): scaled_norm_tensor = torch.tensor(scaled_norm, device=get_current_device(), dtype=torch.float) dist.all_reduce(scaled_norm_tensor, group=pg) total_norms[group_name] = scaled_norm_tensor.item() - + print_memory("No 3") timer("sync_grad").start() self._sync_grad() timer("sync_grad").stop() + + print_memory("No 4") return self._step(closure=closure, norms=total_norms) @@ -661,7 +674,7 @@ def _step(self, closure=None, norms=None): self._grad_store._averaged_gradients = dict() self.zero_grad() return False, norms - + print_memory("No 5") # copy the grad of fp16 param to fp32 param single_grad_partition_groups = [] for group_id in range(self.num_param_groups): @@ -702,7 +715,7 @@ def _step(self, closure=None, norms=None): single_grad_partition_groups.append(flat_fp32_avg_grads) device = self._fp32_flat_param_groups_of_current_rank[group_id].device self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device) - + print_memory("No 6") # unscale and clip grads # get the global norm global_norm_groups = {} @@ -725,9 +738,12 @@ def _step(self, closure=None, norms=None): # For those ranks that are not assigned parameters, we just wait for other ranks # to send them updated their own parameters. if self.has_params: + print_memory("No 7") self.optim.step() + print_memory("No 8") # release the fp32 grad release_param_grad(self._fp32_flat_param_groups_of_current_rank.values()) + print_memory("No 9") # update fp16 partition updated by the current rank for group_id in range(len(self._fp16_param_groups)): if self.param_group_has_params[group_id]: @@ -736,17 +752,18 @@ def _step(self, closure=None, norms=None): ) fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id] fp16_param.data.copy_(fp32_param) - + print_memory("No 10") torch.cuda.synchronize() with torch.cuda.stream(self._comm_bcast_stream): self.broadcast_params() - + timer("step").stop() # update gradients may not be needed here, because the sync_params function is used in initialization, # so synchronization is maintained for group_name, global_norm in global_norm_groups.items(): global_norm_groups[group_name] = global_norm / loss_scale + print_memory("No 11") return True, global_norm_groups def broadcast_params(self): diff --git a/train.py b/train.py index 139bac1f..0a84f592 100644 --- a/train.py +++ b/train.py @@ -296,6 +296,8 @@ def main(args): if batch_count % 2 == 0: prof.step() + + torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() From 47422711545e9332708be2ad26b64996bb9c1447 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 19 Oct 2023 13:21:33 +0800 Subject: [PATCH 033/153] add memory pool --- configs/20B_sft.py | 4 +- configs/30B_sft.py | 4 +- internlm/model/linear.py | 83 +++++++++++++- internlm/model/modeling_internlm.py | 1 - internlm/model/utils.py | 98 +++++++++++----- .../solver/optimizer/hybrid_zero_optim.py | 108 ++++++++++++++---- internlm/solver/optimizer/store.py | 3 + internlm/train/training_internlm.py | 36 +++++- internlm/utils/gputest.py | 15 ++- train.py | 7 +- 10 files changed, 295 insertions(+), 64 deletions(-) diff --git a/configs/20B_sft.py b/configs/20B_sft.py index bc63d346..5a9021be 100644 --- a/configs/20B_sft.py +++ b/configs/20B_sft.py @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=False, 
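
# NOTE (editorial sketch): with torch.cuda.reset_peak_memory_stats() called at
# the end of every step (as train.py does in the patch above),
# max_memory_allocated() reports a per-iteration peak rather than a global
# one. A self-contained helper showing the intended usage; the name is an
# assumption for illustration.
import torch

def log_step_peak(step: int):
    # Call once per training step, after optimizer.step().
    peak_gib = torch.cuda.max_memory_allocated() / 1024**3
    print(f"step {step} peak: {peak_gib:.2f} GiB", flush=True)
    torch.cuda.reset_peak_memory_stats()
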
- total_steps=20, + total_steps=50, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp"), + tensor=dict(size=8, mode="fstp", overlap=True), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/configs/30B_sft.py b/configs/30B_sft.py index 5ac67451..ec040480 100644 --- a/configs/30B_sft.py +++ b/configs/30B_sft.py @@ -1,4 +1,4 @@ -JOB_NAME = "13b_train" +JOB_NAME = "30b_train" DO_ALERT = False SEQ_LEN = 4096 @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp"), + tensor=dict(size=8, mode="origin_tp", overlap=False), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 0ea6ee30..4f05cd32 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -16,6 +16,7 @@ from internlm.model.utils import ( Silu, all_gather_raw, + all_gather_raw_memory_pool, fstp_fused_dense_func, fused_dense_func_torch, ) @@ -219,8 +220,12 @@ def forward(self, x): class FSTPLinear(ColumnParallelLinear): def forward(self, x): + block_index = gpc.config.fstp_handler.module_to_index[self] + name_index = gpc.config.fstp_handler.module_name_index[self] + name = gpc.config.fstp_handler.module_name[name_index] return fstp_fused_dense_func( - x, self.weight, self.bias, process_group=self.process_group, module=self, handler=gpc.config.fstp_handler + x, self.weight, self.bias, process_group=self.process_group, + module=self, handler=gpc.config.fstp_handler, block_index=block_index, module_name=name ) @@ -308,6 +313,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name self.reduce_scatter_handlers = {} + self.all_reduce_handlers = {} # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): @@ -438,6 +444,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.embedding = [] self.reduce_scatter_handlers = {} + self.all_reduce_handlers = {} + self.zero_const_pool = {} # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): @@ -476,12 +484,23 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") if child.bias is not None: setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + # _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + # setattr(child.weight, "_fstp_all_reduce_str", f"{_full_name}.weight") + # if child.bias is not None: + # setattr(child.bias, "_fstp_all_reduce_str", f"{_full_name}.bias") else: continue elif isinstance(children, ScaleColumnParallelLinear): self.head.append(children) elif isinstance(children, Embedding1D): self.embedding.append(children) + + def get_zero_by_shape(self, size:tuple, dtype, device) -> torch.Tensor: + if size not in self.zero_const_pool: + self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() + + return self.zero_const_pool[size] + def _all_gather_block_weight(self, block_index: int): #block = self.index_to_block[block_index] @@ -492,6 +511,17 @@ def _all_gather_block_weight(self, block_index: int): self.FSTP_global_weights[module] = total_weight 
self.FSTP_global_handle[module] = weight_handle # self.block_handles[block].append(weight_handle) + + def _all_gather_block_weight_memory_pool(self, block_index: int): + fsdp_modules = self.index_to_fsdp_modules[block_index] + # self.block_handles[block] = [] + for module in fsdp_modules: + module_index = self.module_name_index[module] + name = self.module_name[module_index] + weight_handle = all_gather_raw_memory_pool(module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name) + # self.FSTP_global_weights[module] = total_weight + self.FSTP_global_handle[module] = weight_handle + # self.block_handles[block].append(weight_handle) def _register_sync_parameters_hook(self) -> None: """ @@ -508,7 +538,8 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): block_index = self.module_to_index[module] # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: - self._all_gather_block_weight(block_index + 1) + # self._all_gather_block_weight(block_index + 1) + self._all_gather_block_weight_memory_pool(block_index + 1) def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): block_index = self.block_to_index[block] @@ -526,7 +557,8 @@ def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): handle.wait() def _pre_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): - self._all_gather_block_weight(0) + # self._all_gather_block_weight(0) + self._all_gather_block_weight_memory_pool(0) def _post_forward_hook_for_block(block: nn.Module, input, output): @@ -583,6 +615,48 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): for module in fsdp_modules: del self.FSTP_global_weights[module] + def _pre_backward_hook_for_module_memory_pool(module: nn.Module, grad_output): + block_index = self.module_to_index[module] + name_index = self.module_name_index[module] + + if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: + # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler = self.FSTP_global_handle[module] + weight_handler.wait() + # self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + next_name = self.module_name[name_index - 1] + weights_handler = all_gather_raw_memory_pool( + next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=next_name + ) + self.FSTP_global_handle[next_module] = weights_handler + elif name_index == 0: + handler = self.FSTP_global_handle[module] + handler.wait() + + if block_index - 1 >= 0: + next_module = self.block_module[block_index - 1][4] + name = self.module_name[4] + weights_handler = all_gather_raw_memory_pool( + next_module.weight, self.process_group, async_op=True, block_index=block_index - 1, module_name=name, + ) + self.FSTP_global_handle[next_module] = weights_handler + else: + handler = self.FSTP_global_handle[module] + handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + name = self.module_name[name_index - 1] + weights_handler = all_gather_raw_memory_pool( + next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name + ) + self.FSTP_global_handle[next_module] = weights_handler + # if module in self.FSTP_global_handle: + # handler = self.FSTP_global_handle[module] + # handler.wait() + def _pre_backward_hook_for_module(module: nn.Module, grad_output): 
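            # (pre-pool variant) Backward mirrors forward in reverse: wait for this
            # module's gathered weight, then immediately launch the all-gather for
            # the module backward will need next (w3 -> w2 -> ... -> Wqkv, crossing
            # into the previous block after Wqkv).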
block_index = self.module_to_index[module] name_index = self.module_name_index[module] @@ -649,5 +723,6 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): for module in self.FSTP_modules: module.register_forward_pre_hook(_pre_forward_hook_for_module) module.register_forward_hook(_post_forward_hook_for_module) - module.register_full_backward_pre_hook(_pre_backward_hook_for_module) + # module.register_full_backward_pre_hook(_pre_backward_hook_for_module) + module.register_full_backward_pre_hook(_pre_backward_hook_for_module_memory_pool) module.register_full_backward_hook(_post_backward_hook_for_module) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index cb933960..b004dffa 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -125,7 +125,6 @@ def __init__( bias=False, device=device, dtype=dtype, - block_idx=block_idx, ) else: self.mlp = ParallelFusedMLP( diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 0194e84a..5b4018c8 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -6,7 +6,7 @@ import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F -from flash_attn.utils.distributed import all_reduce_raw, reduce_scatter_raw +from flash_attn.utils.distributed import all_reduce_raw #, reduce_scatter_raw from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup @@ -124,6 +124,12 @@ def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = ) return output, handle +def all_gather_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0, block_index: int = None, module_name: str = None): + handle = torch.distributed.all_gather_into_tensor( + gpc.config.block_memory[block_index % 2][module_name], input_.contiguous(), group=process_group, async_op=async_op + ) + return handle + def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): assert my_input.dtype == grad_output.dtype @@ -132,6 +138,17 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): return grad_weight, grad_bias +def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + world_size = torch.distributed.get_world_size(process_group) + assert input_.shape[0] % world_size == 0 + output = torch.empty(input_.shape[0] // world_size, *input_.shape[1:], + dtype=input_.dtype, device=input_.device).contiguous() + handle = torch.distributed.reduce_scatter_tensor(output, input_.contiguous(), + group=process_group, + async_op=async_op) + return output, handle + + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): "tp fused dense function" @@ -283,12 +300,14 @@ class FSTPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None): + def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None, block_index=None, module_name=None): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group ctx.all_gather_handler = all_gather_handler ctx.module = module + ctx.block_index = block_index + ctx.module_name = module_name if torch.is_autocast_enabled(): x = 
x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -297,8 +316,9 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all_gather for weight and bias before actual computation - if module in all_gather_handler.FSTP_global_weights: - total_weight = all_gather_handler.FSTP_global_weights[module] + if all_gather_handler is not None:# and module in all_gather_handler.FSTP_global_weights: + # total_weight = all_gather_handler.FSTP_global_weights[module] + total_weight = gpc.config.block_memory[block_index % 2][module_name] else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -323,6 +343,8 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod if min(batch_dim, n, *total_weight.shape) > 65535 * 32: raise RuntimeError("fused_dense only supports matrix dims <= 2M") output = F.linear(total_x, total_weight, total_bias) + del total_weight + del total_bias if ctx.compute_weight_gradient: ctx.save_for_backward(x, weight, bias) else: @@ -339,6 +361,9 @@ def backward(ctx, grad_output, *args): process_group = ctx.process_group all_gather_handler = ctx.all_gather_handler module = ctx.module + block_index = ctx.block_index + module_name = ctx.module_name + if ctx.compute_weight_gradient: x, weight, bias = ctx.saved_tensors total_x = x @@ -351,12 +376,13 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: - # do all-gather for weight before backward - if module in all_gather_handler.FSTP_global_weights: - total_weight = all_gather_handler.FSTP_global_weights[module] - else: - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() + total_weight = gpc.config.block_memory[block_index % 2][module_name] + # # do all-gather for weight before backward + # if module in all_gather_handler.FSTP_global_weights: + # total_weight = all_gather_handler.FSTP_global_weights[module] + # else: + # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + # handle_weight.wait() else: total_weight = weight @@ -368,15 +394,32 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - assert hasattr(weight, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = torch.zeros(grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:], dtype=grad_weight.dtype, device=grad_weight.device) - if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - assert hasattr(bias, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = torch.zeros(grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:], dtype=grad_bias.dtype, device=grad_bias.device) + if gpc.config.fstp_handler is not None: + # grad_weight_async, handle_grad_weight = all_reduce_raw(grad_weight, process_group, async_op=True) + # assert hasattr(weight, "_fstp_all_reduce_str") + # 
all_gather_handler.all_reduce_handlers[weight._fstp_all_reduce_str] = (handle_grad_weight, grad_weight_async) + # grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + # if grad_bias is not None: + # grad_bias_async, handle_grad_bias = all_reduce_raw(grad_bias, process_group, async_op=True) + # assert hasattr(bias, "_fstp_all_reduce_str") + # all_gather_handler.all_reduce_handlers[bias._fstp_all_reduce_str] = (handle_grad_bias, grad_bias_async) + # grad_bias = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) + grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + assert hasattr(weight, "_fstp_reduce_scatter_str") + all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) + grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + if grad_bias is not None: + grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + assert hasattr(bias, "_fstp_reduce_scatter_str") + all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) + grad_bias = all_gather_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) + else: + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + # grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + # if grad_bias is not None: + # grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -389,13 +432,14 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) else: grad_input = None + del total_weight - # if ctx.needs_input_grad[1]: - # if world_size > 1: - # handle_grad_weight.wait() - # if grad_bias is not None: - # handle_grad_bias.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None + if ctx.needs_input_grad[1]: + if world_size > 1 and gpc.config.fstp_handler is None: + handle_grad_weight.wait() + if grad_bias is not None: + handle_grad_bias.wait() + return grad_input, grad_weight, grad_bias, None, None, None, None, None, None def fused_dense_func_torch( @@ -424,12 +468,14 @@ def fstp_fused_dense_func( process_group=None, module=None, handler=None, + block_index=None, + module_name=None, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler) + return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler, block_index, module_name) else: assert 
process_group is None
        out = F.linear(x, weight, bias)
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index d2268274..d0cdd101 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -10,6 +10,7 @@
 from internlm.core.context import Config, ParallelMode
 from internlm.core.context import global_context as gpc
+from internlm.model.utils import split_forward_gather_backward
 from internlm.monitor import send_alert_message
 from internlm.solver.optimizer.store import (
     BucketStore,
@@ -40,12 +41,8 @@
 logger = get_logger(__file__)

 def print_memory(msg):
-
-    if gpc.get_global_rank() == 0:
-        print(msg, flush=True)
-        print("memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, flush=True)
-        print("max memory allocated: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True)
-        print("===========================================")
+    # report allocated / reserved / peak memory in GB on every rank
+    print(
+        msg,
+        " rank = ", gpc.get_global_rank(),
+        " memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024,
+        " reserved memory: ", torch.cuda.memory_reserved() / 1024 / 1024 / 1024,
+        " max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024,
+        flush=True,
+    )
+    print("===========================================")


 class HybridZeroOptimizer(BaseOptimizer):
@@ -73,7 +70,7 @@ def __init__(
         hysteresis = grad_scal_cfg.hysteresis
         max_scale = grad_scal_cfg.max_scale

-        if gpc.config.parallel["tensor"]["mode"] == "fstp":
+        if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"]:
             self._fstp_handler = gpc.config.fstp_handler

         # Zero related args
@@ -94,6 +91,7 @@
         self._param_store = ParameterStore(ParallelMode.ZERO1)
         self._grad_store = GradientStore(ParallelMode.DATA)
         self._bucket_store = []
+        self._bucket_store_2 = []
         self._bucket_in_progress = []

         # fp16 and fp32 params for mixed precision training
@@ -162,6 +160,7 @@
             # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name
             self._broadcast_parallel_mode.append(zero_mode)
             self._bucket_store.append(BucketStore(group_id, param_group["dp_mode"]))
+            self._bucket_store_2.append(BucketStore(group_id, param_group["dp_mode"]))

             # assign parameters to ranks the params in the list are sorted
             params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group)
@@ -307,12 +306,22 @@ def _define_and_attach(param, reduce_rank=None):
                 param=param,
                 reduce_rank=reduce_rank,
             )
+
+            reduce_scatter_checker = partial(
+                self._wait_reduce_scatter_and_accumulate_grad,
+                param=param,
+                reduce_rank=reduce_rank,
+            )

             # define hook
             # NOT IMPORTANT BUT GOOD TO KNOW:
             # args here are not grads, but allow_unreachable and accumulate_grad
             def reduce_grad_hook(*args):  # pylint: disable=W0613
-                reduction_func()
+                if gpc.config.fstp_handler is not None:
+                    reduce_scatter_checker()
+
+                if self.skip_grad_reduce is False:
+                    reduction_func()

             accum_grad_obj.register_hook(reduce_grad_hook)

@@ -333,7 +342,7 @@ def belongs_to_current_rank(self, param) -> bool:
         return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id])

     def reset_reduce_bucket(self) -> None:
-        for bucket in self._bucket_store:
+        for bucket in self._bucket_store_2:
             for rank, params in bucket._params.items():
                 for _param in params:
                     if not hasattr(_param, "_fstp_reduce_scatter_str"):
                         continue

                     key = getattr(_param, "_fstp_reduce_scatter_str")
                     comm_handle, _grad = 
self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() - _param.grad += _grad + _param.grad.add_(_grad) + # self._fstp_handler.reduce_scatter_handlers[key] = None + del _grad + del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None + assert key in self._fstp_handler.reduce_scatter_handlers + # if not hasattr(_param, "_fstp_all_reduce_str"): + # continue + + # key = getattr(_param, "_fstp_all_reduce_str") + # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key] + # comm_handle.wait() + # with torch.no_grad(): + # _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0) + # _param.grad.add_(_grad) + # # self._fstp_handler.reduce_scatter_handlers[key] = None + # del _grad + # del self._fstp_handler.all_reduce_handlers[key] + # self._fstp_handler.all_reduce_handlers[key] = None + # assert key in self._fstp_handler.all_reduce_handlers bucket.reset_by_rank(rank) - - def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): + + def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): param_size = param.numel() # check if the bucket is full # if full, will reduce the grads already in the bucket # after reduction, the bucket will be empty group_id = getattr(param, "group_id") - current_bucket = self._bucket_store[group_id] + current_bucket = self._bucket_store_2[group_id] - if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: + if current_bucket.num_elements_in_bucket(reduce_rank) >= 512 * 1024 * 1024: # wait reduce scatter communication params = current_bucket.get_param(reduce_rank) for _param in params: @@ -366,18 +393,48 @@ def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): key = getattr(_param, "_fstp_reduce_scatter_str") comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() - _param.grad += _grad + _param.grad.add_(_grad) + # self._fstp_handler.reduce_scatter_handlers[key] = None + del _grad + del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None + assert key in self._fstp_handler.reduce_scatter_handlers + + # if not hasattr(_param, "_fstp_all_reduce_str"): + # continue + + # key = getattr(_param, "_fstp_all_reduce_str") + # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key] + # comm_handle.wait() + # with torch.no_grad(): + # _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0) + # _param.grad.add_(_grad) + # # self._fstp_handler.reduce_scatter_handlers[key] = None + # del _grad + # del self._fstp_handler.all_reduce_handlers[key] + # self._fstp_handler.all_reduce_handlers[key] = None + # assert key in self._fstp_handler.all_reduce_handlers - # reduce grad - if self.skip_grad_reduce is False: - self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False) - else: current_bucket.reset_by_rank(reduce_rank) + + current_bucket.add_num_elements_in_bucket(param_size, reduce_rank) + current_bucket.add_param(param, reduce_rank) + + def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): + param_size = param.numel() + + # check if the bucket is full + # if full, will reduce the grads already in the bucket + # after reduction, the bucket will be empty + group_id = getattr(param, "group_id") + current_bucket = self._bucket_store[group_id] + + if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: + 
# bucket is full: flush the grads already queued before admitting the new one
+            self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False)

         # the param must not be reduced to ensure correctness
         is_param_reduced = self._param_store.is_param_reduced(param)
-        if is_param_reduced and self.skip_grad_reduce is False:
+        if is_param_reduced:
             msg = (
                 f"Parameter of size ({param.size()}) has already been reduced, "
                 + "duplicate reduction will lead to arithmetic incorrectness"
             )
@@ -628,8 +685,15 @@ def step(self, closure=None):
         timer("sync_grad").stop()

         print_memory("No 4")
-
-        return self._step(closure=closure, norms=total_norms)
+
+        try:
+            res = self._step(closure=closure, norms=total_norms)
+        except torch.cuda.OutOfMemoryError as e:
+            print(e, flush=True)
+            print(torch.cuda.memory_summary(), flush=True)
+            torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle")
+            # re-raise after dumping diagnostics; otherwise `res` would be
+            # undefined here and the OOM would be silently swallowed
+            raise
+
+        return res

     def _step(self, closure=None, norms=None):
         assert closure is None, "closure is not supported by step()"
diff --git a/internlm/solver/optimizer/store.py b/internlm/solver/optimizer/store.py
index 33380eb4..228045ed 100644
--- a/internlm/solver/optimizer/store.py
+++ b/internlm/solver/optimizer/store.py
@@ -45,6 +45,9 @@ def __init__(self, group_id, dp_parallel_mode):

     def num_elements_in_bucket(self, reduce_rank: int = None):
         return self._num_elements_in_bucket[reduce_rank]
+
+    def num_params_in_bucket(self, reduce_rank: int = None):
+        return len(self._params[reduce_rank])

     def get_param_group_id(self):
         return self._group_id
diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py
index 93903a38..f39e3845 100644
--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@@ -108,12 +108,45 @@ def initialize_model():

     # if fsdp enabled, wrap the model
     model = wrap_FSDP_model(model)
+
+    gpc.config.fstp_handler = None

-    if gpc.config.parallel["tensor"]["mode"] == "fstp":
+    if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"]:
         handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR))
         # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR))
         handler._register_sync_parameters_hook()
         gpc.config.fstp_handler = handler
+
+    # allocate the memory pool: two groups of block weights, so block i can
+    # compute while the weights of block i + 1 are being gathered
+    block_memory = {}
+    hidden_size = gpc.config.HIDDEN_SIZE
+    mlp_ratio = gpc.config.MLP_RATIO
+    mlp_hidden_size = int(hidden_size * mlp_ratio)
+    mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256)
+    # shapes used later as keys for the reduce-scatter memory pool
+    size_key = [(3 * hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (hidden_size, hidden_size)]
+    module_name = ['Wqkv', 'out_proj', 'w1', 'w2', 'w3']
+    for i in range(2):
+        weight = {}
+        for name in module_name:
+            if name == 'Wqkv':
+                weight[name] = torch.zeros((3 * hidden_size, hidden_size),
+                                           dtype=gpc.config.model.get("dtype", torch.half),
+                                           device='cuda').contiguous()
+            elif name == 'out_proj':
+                weight[name] = torch.zeros((hidden_size, hidden_size),
+                                           dtype=gpc.config.model.get("dtype", torch.half),
+                                           device='cuda').contiguous()
+            elif name == 'w1' or name == 'w2':
+                weight[name] = torch.zeros((mlp_hidden_size, hidden_size),
+                                           dtype=gpc.config.model.get("dtype", torch.half),
+                                           device='cuda').contiguous()
+            else:
+                weight[name] = torch.zeros((hidden_size, mlp_hidden_size),
+                                           dtype=gpc.config.model.get("dtype", torch.half),
+                                           device='cuda').contiguous()
+        block_memory[i] = weight
+    gpc.config.block_memory = block_memory

     return model

@@ -393,6 +426,7 @@ def
initialize_llm_profile(profiling: bool = False, start_time: str = None): ), with_stack=True, with_modules=True, + profile_memory=True, ) diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index 48877b90..52d96385 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -36,10 +36,17 @@ def empty_cache_and_diag(batch_count, interval=50): if batch_count > 0: if gpc.is_rank_for_log(): logger.info("Empty Cache and Diagnosis GPU/NCCL/Timer ...") - with torch.no_grad(): - timer_diagnosis() - bench_gpu() - bench_net() + # with torch.no_grad(): + # try: + # timer_diagnosis() + # bench_gpu() + # bench_net() + # except torch.distributed.DistBackendError as e: + # # import time + # # time.sleep(10) + # print(e, "rank = ", gpc.get_global_rank(), flush=True) + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # do empty_cache after the bench torch.cuda.empty_cache() # do garbage collection diff --git a/train.py b/train.py index 0a84f592..c972bea9 100644 --- a/train.py +++ b/train.py @@ -195,6 +195,7 @@ def main(args): # start iterating the train data and begin training for batch_count in range(train_state.batch_count, total_steps): empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval) + torch.cuda.memory._record_memory_history() start_time = time.time() timer("one-batch").start() @@ -294,9 +295,11 @@ def main(args): if memory_profiler is not None: memory_profiler.step() - if batch_count % 2 == 0: - prof.step() + prof.step() + if gpc.config.fstp_handler is not None: + gpc.config.fstp_handler.zero_const_pool = {} + torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() From ed7232777a0214d7ee605872477eea3e25521c53 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Fri, 20 Oct 2023 10:35:45 +0800 Subject: [PATCH 034/153] support reduce scatter memory pool --- configs/20B_sft.py | 2 +- configs/30B_sft.py | 8 +-- configs/7B_sft.py | 2 +- internlm/model/utils.py | 52 ++++++++++++++++++- .../solver/optimizer/hybrid_zero_optim.py | 8 +-- internlm/train/training_internlm.py | 18 ++++--- train.py | 1 + 7 files changed, 74 insertions(+), 17 deletions(-) diff --git a/configs/20B_sft.py b/configs/20B_sft.py index 5a9021be..13e68b22 100644 --- a/configs/20B_sft.py +++ b/configs/20B_sft.py @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=False, - total_steps=50, + total_steps=20, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded diff --git a/configs/30B_sft.py b/configs/30B_sft.py index ec040480..8bde0571 100644 --- a/configs/30B_sft.py +++ b/configs/30B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 6144 NUM_ATTENTION_HEAD = 48 MLP_RATIO = 8 / 3 -NUM_LAYER = 40 +NUM_LAYER = 60 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -51,7 +51,7 @@ # micro_num means the number of micro_batch contained in one gradient update micro_num=4, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=4, + micro_bsz=2, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -161,8 +161,8 @@ sequence parallel (bool): enable/disable sequence parallel, defaults to False. 
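tensor (dict): size and mode of tensor parallel; with mode "fstp" the linear weights are
    sharded across the tensor group, and overlap=True additionally enables the asynchronous
    all-gather/reduce-scatter handler so communication hides behind computation.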
""" parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="origin_tp", overlap=False), + zero1=dict(size=4, fsdp=False), + tensor=dict(size=8, mode="fstp", overlap=True), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 106548a2..6ea8b96e 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp"), + tensor=dict(size=8, mode="fstp", overlap=True), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 5b4018c8..2667efed 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -14,6 +14,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger +from internlm.utils.common import get_current_device logger = get_logger(__file__) @@ -148,6 +149,18 @@ def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bo async_op=async_op) return output, handle +def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + world_size = torch.distributed.get_world_size(process_group) + assert input_.shape[0] % world_size == 0 + size = (input_.shape[0] // world_size, *input_.shape[1:]) + index = check_reduce_scatter_memory_pool(size) + output = gpc.config.reduce_scatter_memory[size]['data'][index] + setattr(output, "index", index) + handle = torch.distributed.reduce_scatter_tensor(output, input_.contiguous(), + group=process_group, + async_op=async_op) + return output, handle + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): @@ -404,12 +417,13 @@ def backward(ctx, grad_output, *args): # assert hasattr(bias, "_fstp_all_reduce_str") # all_gather_handler.all_reduce_handlers[bias._fstp_all_reduce_str] = (handle_grad_bias, grad_bias_async) # grad_bias = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) - grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + + grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(grad_weight, process_group, async_op=True) assert hasattr(weight, "_fstp_reduce_scatter_str") all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool(grad_bias, process_group, async_op=True) assert hasattr(bias, "_fstp_reduce_scatter_str") all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) grad_bias = all_gather_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) @@ -521,3 +535,37 @@ def Silu(w1_o, w2_o): Silu = 
torch.jit.script(Silu) + +def check_reduce_scatter_memory_pool(key): + + return_idx = 0 + + # if key not in dict + if key not in gpc.config.reduce_scatter_memory: + gpc.config.reduce_scatter_memory[key] = {'data': [], 'used': []} + + # if the data is empty + if len(gpc.config.reduce_scatter_memory[key]['data']) == 0: + gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key, + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device()).contiguous()) + gpc.config.reduce_scatter_memory[key]['used'].append(True) + return_idx = 0 + return return_idx + else: # if not empty + for index, used in enumerate(gpc.config.reduce_scatter_memory[key]['used']): + if used == False: + gpc.config.reduce_scatter_memory[key]['used'][index] = True + return_idx = index + return return_idx + # if the memory pool is all used + length = len(gpc.config.reduce_scatter_memory[key]['data']) + gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key, + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device()).contiguous()) + gpc.config.reduce_scatter_memory[key]['used'].append(True) + return_idx = length + return return_idx + +def release_reduce_scatter_memory_pool(size, index): + gpc.config.reduce_scatter_memory[size]['used'][index] = False \ No newline at end of file diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index d0cdd101..96a54c01 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -10,7 +10,7 @@ from internlm.core.context import Config, ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import split_forward_gather_backward +from internlm.model.utils import split_forward_gather_backward, release_reduce_scatter_memory_pool from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, @@ -353,7 +353,8 @@ def reset_reduce_bucket(self) -> None: comm_handle.wait() _param.grad.add_(_grad) # self._fstp_handler.reduce_scatter_handlers[key] = None - del _grad + # del _grad + release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index) del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers @@ -395,7 +396,8 @@ def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): comm_handle.wait() _param.grad.add_(_grad) # self._fstp_handler.reduce_scatter_handlers[key] = None - del _grad + # del _grad + release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index) del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index f39e3845..2816da0e 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -51,7 +51,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile +from internlm.utils.common import DummyProfile, get_current_device from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import 
sync_model_param, sync_model_param_within_tp @@ -123,7 +123,8 @@ def initialize_model(): mlp_ratio = gpc.config.MLP_RATIO mlp_hidden_size = int(hidden_size * mlp_ratio) mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) - size_key = [(3 * hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (hidden_size, hidden_size)] + world_size = gpc.get_world_size(ParallelMode.TENSOR) + size_key = [(3 * hidden_size // world_size, hidden_size), (mlp_hidden_size // world_size, hidden_size), (hidden_size // world_size, mlp_hidden_size), (hidden_size // world_size, hidden_size)] module_name = ['Wqkv', 'out_proj', 'w1', 'w2', 'w3'] for i in range(2): weight = {} @@ -131,21 +132,26 @@ def initialize_model(): if name == 'Wqkv': weight[name] = torch.zeros((3 * hidden_size, hidden_size), dtype=gpc.config.model.get("dtype", torch.half), - device='cuda').contiguous() + device=get_current_device()).contiguous() elif name == 'out_proj': weight[name] = torch.zeros((hidden_size, hidden_size), dtype=gpc.config.model.get("dtype", torch.half), - device='cuda').contiguous() + device=get_current_device()).contiguous() elif name == 'w1' or name == 'w2': weight[name] = torch.zeros((mlp_hidden_size, hidden_size), dtype=gpc.config.model.get("dtype", torch.half), - device='cuda').contiguous() + device=get_current_device()).contiguous() else: weight[name] = torch.zeros((hidden_size, mlp_hidden_size), dtype=gpc.config.model.get("dtype", torch.half), - device='cuda').contiguous() + device=get_current_device()).contiguous() block_memory[i] = weight + reduce_scatter_memory = {} + for key in size_key: + reduce_scatter_memory[key] = {'data': [], 'used': []} + gpc.config.block_memory = block_memory + gpc.config.reduce_scatter_memory = reduce_scatter_memory return model diff --git a/train.py b/train.py index c972bea9..41ab070d 100644 --- a/train.py +++ b/train.py @@ -299,6 +299,7 @@ def main(args): if gpc.config.fstp_handler is not None: gpc.config.fstp_handler.zero_const_pool = {} + gpc.config.fstp_handler.reduce_scatter_memory = {} torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() From 815a584930622d6c9c81508d41132a6413c86420 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 20 Oct 2023 11:27:59 +0800 Subject: [PATCH 035/153] feat(model/linear.py): remove useless code --- internlm/model/linear.py | 307 +++---------------------- internlm/model/modeling_internlm.py | 3 - internlm/model/multi_head_attention.py | 1 - internlm/model/utils.py | 152 +++++++----- internlm/train/training_internlm.py | 58 +++-- train.py | 2 +- 6 files changed, 166 insertions(+), 357 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 4f05cd32..61a5cfc1 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -177,7 +177,6 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - block_idx: int = 0, ): super().__init__() @@ -224,8 +223,14 @@ def forward(self, x): name_index = gpc.config.fstp_handler.module_name_index[self] name = gpc.config.fstp_handler.module_name[name_index] return fstp_fused_dense_func( - x, self.weight, self.bias, process_group=self.process_group, - module=self, handler=gpc.config.fstp_handler, block_index=block_index, module_name=name + x, + self.weight, + self.bias, + process_group=self.process_group, + module=self, + handler=gpc.config.fstp_handler, + block_index=block_index, + 
module_name=name, ) @@ -255,7 +260,6 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - block_idx: int = 0, ): super().__init__() @@ -296,129 +300,6 @@ def forward(self, x): return out -class FSTPAllGatherSyncHandler: - """ - All-gather handler for overlapping the all-gather in adjcent FSTP linear. - """ - - def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - # import pdb; pdb.set_trace() - self.process_group = process_group - self.FSTP_modules = [] - self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward - self.module_handler = dict() # key: FSTP module; value: all-gather handler - self.module_block = dict() # key: FSTP module; value: transformer block index - self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name - - self.reduce_scatter_handlers = {} - self.all_reduce_handlers = {} - - # just want to share same for loop for ModuleList and Module - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _chunk_name, children in _chunk.named_children(): - if isinstance(children, nn.ModuleList): - for idx, block in enumerate(children): - index = 0 - self.block_module[idx] = {} - for _sub_name, sub in block.named_children(): - sub_modules = list(sub.children()) - if len(sub_modules) > 0: - for name, child in sub.named_children(): - if isinstance(child, FSTPLinear): - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - - self.FSTP_modules.append(child) - self.module_block[child] = idx - self.block_module[idx][index] = child - self.module_name_index[child] = index - index = index + 1 - else: - continue - - def _register_sync_parameters_hook(self) -> None: - """ - register pre_forward_hook and pre_backward_hook for FSTPLinear. 
- """ - - def _pre_forward_hook(module: nn.Module, inputs: Any): - block_index = self.module_block[module] - name_index = self.module_name_index[module] - if name_index == 0: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - else: - handler = self.module_handler[module] - handler.wait() - if name_index != 4: - next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - - def _post_forward_hook(module: nn.Module, input, output): - if module in self.FSTP_global_weights: - del self.FSTP_global_weights[module] - if module in self.module_handler: - del self.module_handler[module] - - def _pre_backward_hook(module: nn.Module, grad_output): - block_index = self.module_block[module] - name_index = self.module_name_index[module] - if name_index == 4: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - else: - handler = self.module_handler[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - - def _post_backward_hook(module, grad_input, grad_output): - del self.FSTP_global_weights[module] - - for module in self.FSTP_modules: - # import pdb; pdb.set_trace() - module.register_forward_pre_hook(_pre_forward_hook) - module.register_forward_hook(_post_forward_hook) - # module.register_backward_pre_hook(_pre_backward_hook) - # module.register_backward_hook(_post_backward_hook) - module.register_full_backward_pre_hook(_pre_backward_hook) - module.register_full_backward_hook(_post_backward_hook) - - class CoarseGrainedFSTPAllGatherSyncHandler: """ All-gather handler for overlapping the all-gather in adjcent FSTP block. 
@@ -479,49 +360,33 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.index_to_fsdp_modules[idx].append(child) self.module_name_index[child] = index index = index + 1 - + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") if child.bias is not None: setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - # _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - # setattr(child.weight, "_fstp_all_reduce_str", f"{_full_name}.weight") - # if child.bias is not None: - # setattr(child.bias, "_fstp_all_reduce_str", f"{_full_name}.bias") else: continue elif isinstance(children, ScaleColumnParallelLinear): self.head.append(children) elif isinstance(children, Embedding1D): self.embedding.append(children) - - def get_zero_by_shape(self, size:tuple, dtype, device) -> torch.Tensor: - if size not in self.zero_const_pool: + + def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: + if size not in self.zero_const_pool: self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() - - return self.zero_const_pool[size] + return self.zero_const_pool[size] - def _all_gather_block_weight(self, block_index: int): - #block = self.index_to_block[block_index] - fsdp_modules = self.index_to_fsdp_modules[block_index] - # self.block_handles[block] = [] - for module in fsdp_modules: - total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) - self.FSTP_global_weights[module] = total_weight - self.FSTP_global_handle[module] = weight_handle - # self.block_handles[block].append(weight_handle) - def _all_gather_block_weight_memory_pool(self, block_index: int): fsdp_modules = self.index_to_fsdp_modules[block_index] - # self.block_handles[block] = [] for module in fsdp_modules: module_index = self.module_name_index[module] name = self.module_name[module_index] - weight_handle = all_gather_raw_memory_pool(module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name) - # self.FSTP_global_weights[module] = total_weight + weight_handle = all_gather_raw_memory_pool( + module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name + ) self.FSTP_global_handle[module] = weight_handle - # self.block_handles[block].append(weight_handle) def _register_sync_parameters_hook(self) -> None: """ @@ -538,41 +403,14 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): block_index = self.module_to_index[module] # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: - # self._all_gather_block_weight(block_index + 1) self._all_gather_block_weight_memory_pool(block_index + 1) - def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): - block_index = self.block_to_index[block] - if block_index == 0: - # all gather weight for block 0 - fsdp_modules = self.index_to_fsdp_modules[block_index] - for module in fsdp_modules: - total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handle.wait() - self.FSTP_global_weights[module] = total_weight - else: - # wait handle for current block - handles = self.block_handles[block] - for handle in handles: - handle.wait() - - def _pre_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): - # self._all_gather_block_weight(0) + def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): 
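            # runs after the embedding's forward completes: start gathering
            # block 0's weights so the first transformer block finds the
            # all-gather already in flight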
self._all_gather_block_weight_memory_pool(0) - - - def _post_forward_hook_for_block(block: nn.Module, input, output): - block_index = self.block_to_index[block] - fsdp_modules = self.index_to_fsdp_modules[block_index] - if block in self.block_handles: - del self.block_handles[block] - for module in fsdp_modules: - del self.FSTP_global_weights[module] - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any,): - block_index = self.module_to_index[module] - handler = self.FSTP_global_handle[module] - handler.wait() + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + handle = self.FSTP_global_handle[module] + handle.wait() def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_weights: @@ -580,67 +418,44 @@ def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] - def _pre_backward_hook_for_block(block: nn.Module, grad_output): - # import pdb; pdb.set_trace() - block_index = self.block_to_index[block] - # if block_index == gpc.config.NUM_LAYER - 1: - # # all gather weight for the last block - # fsdp_modules = self.index_to_fsdp_modules[block_index] - # for module in fsdp_modules: - # total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) - # weight_handle.wait() - # self.FSTP_global_weights[module] = total_weight - # else: - # # wait handle for current block - # handles = self.block_handles[block] - # for handle in handles: - # handle.wait() - # if block_index == gpc.config.NUM_LAYER - 1: - # self._all_gather_block_weight(block_index) - # start the all-gather for next block - if block_index - 1 >= 0: - self._all_gather_block_weight(block_index - 1) - def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): first_module = self.block_module[gpc.config.NUM_LAYER - 1][4] total_weight, weight_handler = all_gather_raw(first_module.weight, self.process_group, async_op=True) self.FSTP_global_handle[first_module] = weight_handler self.FSTP_global_weights[first_module] = total_weight - def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): - block_index = self.block_to_index[block] - fsdp_modules = self.index_to_fsdp_modules[block_index] - if block in self.block_handles: - del self.block_handles[block] - for module in fsdp_modules: - del self.FSTP_global_weights[module] - def _pre_backward_hook_for_module_memory_pool(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] - + if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) weight_handler = self.FSTP_global_handle[module] weight_handler.wait() - # self.FSTP_global_weights[module] = total_weight # start the all-gather for next module next_module = self.block_module[block_index][name_index - 1] next_name = self.module_name[name_index - 1] weights_handler = all_gather_raw_memory_pool( - next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=next_name + next_module.weight, + self.process_group, + async_op=True, + block_index=block_index, + module_name=next_name, ) self.FSTP_global_handle[next_module] = weights_handler elif name_index == 0: handler = self.FSTP_global_handle[module] handler.wait() - + if block_index - 1 >= 0: next_module = self.block_module[block_index - 1][4] name = self.module_name[4] 
weights_handler = all_gather_raw_memory_pool( - next_module.weight, self.process_group, async_op=True, block_index=block_index - 1, module_name=name, + next_module.weight, + self.process_group, + async_op=True, + block_index=block_index - 1, + module_name=name, ) self.FSTP_global_handle[next_module] = weights_handler else: @@ -653,76 +468,24 @@ def _pre_backward_hook_for_module_memory_pool(module: nn.Module, grad_output): next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name ) self.FSTP_global_handle[next_module] = weights_handler - # if module in self.FSTP_global_handle: - # handler = self.FSTP_global_handle[module] - # handler.wait() - - def _pre_backward_hook_for_module(module: nn.Module, grad_output): - block_index = self.module_to_index[module] - name_index = self.module_name_index[module] - - if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler = self.FSTP_global_handle[module] - weight_handler.wait() - # self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - elif name_index == 0: - handler = self.FSTP_global_handle[module] - handler.wait() - - if block_index - 1 >= 0: - next_module = self.block_module[block_index - 1][4] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - else: - handler = self.FSTP_global_handle[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - # if module in self.FSTP_global_handle: - # handler = self.FSTP_global_handle[module] - # handler.wait() def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_weights: del self.FSTP_global_weights[module] if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] - + for embedding in self.embedding: - embedding.register_forward_hook(_pre_forward_hook_for_embedding) - + embedding.register_forward_hook(_post_forward_hook_for_embedding) + for head in self.head: head.register_full_backward_hook(_post_backward_hook_for_head) - # for block in self.FSTP_blocks: - # block.register_forward_pre_hook(_pre_forward_hook_for_block) - # block.register_forward_hook(_post_forward_hook_for_block) - # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) - # block.register_full_backward_hook(_post_backward_hook_for_block) - for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - - # for wqkv in self.FSTP_wqkvs: - # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) for module in self.FSTP_modules: module.register_forward_pre_hook(_pre_forward_hook_for_module) module.register_forward_hook(_post_forward_hook_for_module) - # module.register_full_backward_pre_hook(_pre_backward_hook_for_module) 
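            # note: register_full_backward_pre_hook fires just before grads for
            # the module are computed, which is exactly when the prefetched
            # weight gather must be waited on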
module.register_full_backward_pre_hook(_pre_backward_hook_for_module_memory_pool) module.register_full_backward_hook(_post_backward_hook_for_module) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index b004dffa..0df2b60e 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -78,7 +78,6 @@ def __init__( use_swiglu: bool = True, use_flash_attn: bool = True, tp_mode: str = "origin_tp", - block_idx: int = 0, ): super().__init__() self.checkpoint = checkpoint @@ -104,7 +103,6 @@ def __init__( device=device, dtype=dtype, tp_mode=tp_mode, - block_idx=block_idx, ) self.dropout1 = nn.Dropout(drop_rate) @@ -346,7 +344,6 @@ def __init__( use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, tp_mode=self.tp_mode, - block_idx=lid, ) for lid in range(num_layers) ] diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 7a0f4ed7..8dcd3f96 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -176,7 +176,6 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, tp_mode: str = "origin_tp", - block_idx: int = 0, ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 2667efed..b9c7c03a 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -6,15 +6,15 @@ import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F -from flash_attn.utils.distributed import all_reduce_raw #, reduce_scatter_raw +from flash_attn.utils.distributed import all_reduce_raw # , reduce_scatter_raw from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.utils.logger import get_logger from internlm.utils.common import get_current_device +from internlm.utils.logger import get_logger logger = get_logger(__file__) @@ -125,9 +125,20 @@ def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = ) return output, handle -def all_gather_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0, block_index: int = None, module_name: str = None): + +def all_gather_raw_memory_pool( + input_: Tensor, + process_group: ProcessGroup, + async_op: bool = False, + gather_dim: int = 0, + block_index: int = None, + module_name: str = None, +): handle = torch.distributed.all_gather_into_tensor( - gpc.config.block_memory[block_index % 2][module_name], input_.contiguous(), group=process_group, async_op=async_op + gpc.config.block_memory[block_index % 2][module_name], + input_.contiguous(), + group=process_group, + async_op=async_op, ) return handle @@ -142,23 +153,25 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 - output = torch.empty(input_.shape[0] // world_size, *input_.shape[1:], - dtype=input_.dtype, device=input_.device).contiguous() - handle = torch.distributed.reduce_scatter_tensor(output, input_.contiguous(), - group=process_group, - async_op=async_op) + output = torch.empty( + input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, 
device=input_.device + ).contiguous() + handle = torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) return output, handle + def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 size = (input_.shape[0] // world_size, *input_.shape[1:]) index = check_reduce_scatter_memory_pool(size) - output = gpc.config.reduce_scatter_memory[size]['data'][index] + output = gpc.config.reduce_scatter_memory[size]["data"][index] setattr(output, "index", index) - handle = torch.distributed.reduce_scatter_tensor(output, input_.contiguous(), - group=process_group, - async_op=async_op) + handle = torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) return output, handle @@ -313,7 +326,18 @@ class FSTPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None, block_index=None, module_name=None): + def forward( + ctx, + x, + weight, + bias, + return_residual=False, + process_group=None, + module=None, + all_gather_handler=None, + block_index=None, + module_name=None, + ): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group @@ -329,9 +353,9 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all_gather for weight and bias before actual computation - if all_gather_handler is not None:# and module in all_gather_handler.FSTP_global_weights: - # total_weight = all_gather_handler.FSTP_global_weights[module] - total_weight = gpc.config.block_memory[block_index % 2][module_name] + if all_gather_handler is not None: # and module in all_gather_handler.FSTP_global_weights: + # total_weight = all_gather_handler.FSTP_global_weights[module] + total_weight = gpc.config.block_memory[block_index % 2][module_name] else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -376,7 +400,7 @@ def backward(ctx, grad_output, *args): module = ctx.module block_index = ctx.block_index module_name = ctx.module_name - + if ctx.compute_weight_gradient: x, weight, bias = ctx.saved_tensors total_x = x @@ -408,32 +432,43 @@ def backward(ctx, grad_output, *args): ) if world_size > 1: if gpc.config.fstp_handler is not None: - # grad_weight_async, handle_grad_weight = all_reduce_raw(grad_weight, process_group, async_op=True) - # assert hasattr(weight, "_fstp_all_reduce_str") - # all_gather_handler.all_reduce_handlers[weight._fstp_all_reduce_str] = (handle_grad_weight, grad_weight_async) - # grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) - # if grad_bias is not None: - # grad_bias_async, handle_grad_bias = all_reduce_raw(grad_bias, process_group, async_op=True) - # assert hasattr(bias, "_fstp_all_reduce_str") - # all_gather_handler.all_reduce_handlers[bias._fstp_all_reduce_str] = (handle_grad_bias, grad_bias_async) - # grad_bias = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), 
dtype=grad_bias.dtype, device=grad_bias.device) - - grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(grad_weight, process_group, async_op=True) + grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( + grad_weight, process_group, async_op=True + ) assert hasattr(weight, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( + handle_grad_weight, + grad_weight_async, + ) + grad_weight = all_gather_handler.get_zero_by_shape( + ( + grad_weight.shape[0] // torch.distributed.get_world_size(process_group), + *grad_weight.shape[1:], + ), + dtype=grad_weight.dtype, + device=grad_weight.device, + ) if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool(grad_bias, process_group, async_op=True) + grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( + grad_bias, process_group, async_op=True + ) assert hasattr(bias, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = all_gather_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) + all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( + handle_grad_bias, + grad_bias_async, + ) + grad_bias = all_gather_handler.get_zero_by_shape( + ( + grad_bias.shape[0] // torch.distributed.get_world_size(process_group), + *grad_bias.shape[1:], + ), + dtype=grad_bias.dtype, + device=grad_bias.device, + ) else: grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - # grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - # if grad_bias is not None: - # grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -489,7 +524,9 @@ def fstp_fused_dense_func( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler, block_index, module_name) + return FSTPFusedDenseFunc.apply( + x, weight, bias, return_residual, process_group, module, handler, block_index, module_name + ) else: assert process_group is None out = F.linear(x, weight, bias) @@ -536,36 +573,37 @@ def Silu(w1_o, w2_o): Silu = torch.jit.script(Silu) + def check_reduce_scatter_memory_pool(key): - return_idx = 0 - + # if key not in dict if key not in gpc.config.reduce_scatter_memory: - gpc.config.reduce_scatter_memory[key] = {'data': [], 'used': []} - + gpc.config.reduce_scatter_memory[key] = {"data": [], "used": []} + # if the data is empty - if len(gpc.config.reduce_scatter_memory[key]['data']) == 0: - gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key, - dtype=gpc.config.model.get("dtype", 
torch.half), - device=get_current_device()).contiguous()) - gpc.config.reduce_scatter_memory[key]['used'].append(True) + if len(gpc.config.reduce_scatter_memory[key]["data"]) == 0: + gpc.config.reduce_scatter_memory[key]["data"].append( + torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() + ) + gpc.config.reduce_scatter_memory[key]["used"].append(True) return_idx = 0 return return_idx - else: # if not empty - for index, used in enumerate(gpc.config.reduce_scatter_memory[key]['used']): - if used == False: - gpc.config.reduce_scatter_memory[key]['used'][index] = True + else: # if not empty + for index, used in enumerate(gpc.config.reduce_scatter_memory[key]["used"]): + if used is False: + gpc.config.reduce_scatter_memory[key]["used"][index] = True return_idx = index return return_idx # if the memory pool is all used - length = len(gpc.config.reduce_scatter_memory[key]['data']) - gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key, - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous()) - gpc.config.reduce_scatter_memory[key]['used'].append(True) + length = len(gpc.config.reduce_scatter_memory[key]["data"]) + gpc.config.reduce_scatter_memory[key]["data"].append( + torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() + ) + gpc.config.reduce_scatter_memory[key]["used"].append(True) return_idx = length return return_idx + def release_reduce_scatter_memory_pool(size, index): - gpc.config.reduce_scatter_memory[size]['used'][index] = False \ No newline at end of file + gpc.config.reduce_scatter_memory[size]["used"][index] = False diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2816da0e..5205ba5b 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -38,7 +38,6 @@ from internlm.model.linear import ( CoarseGrainedFSTPAllGatherSyncHandler, FeedForward, - FSTPAllGatherSyncHandler, RewardModelLinear, ScaleColumnParallelLinear, ) @@ -108,7 +107,7 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - + gpc.config.fstp_handler = None if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: @@ -116,40 +115,53 @@ def initialize_model(): # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler - + # allocate memory pool - block_memory = {} # containing two groups of block weight + block_memory = {} # containing two groups of block weight hidden_size = gpc.config.HIDDEN_SIZE mlp_ratio = gpc.config.MLP_RATIO mlp_hidden_size = int(hidden_size * mlp_ratio) mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) world_size = gpc.get_world_size(ParallelMode.TENSOR) - size_key = [(3 * hidden_size // world_size, hidden_size), (mlp_hidden_size // world_size, hidden_size), (hidden_size // world_size, mlp_hidden_size), (hidden_size // world_size, hidden_size)] - module_name = ['Wqkv', 'out_proj', 'w1', 'w2', 'w3'] + size_key = [ + (3 * hidden_size // world_size, hidden_size), + (mlp_hidden_size // world_size, hidden_size), + (hidden_size // world_size, mlp_hidden_size), + (hidden_size // world_size, hidden_size), + ] + module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] for i in range(2): weight = {} for name in module_name: - if name == 'Wqkv': - weight[name] = torch.zeros((3 * 
hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() - elif name == 'out_proj': - weight[name] = torch.zeros((hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() - elif name == 'w1' or name == 'w2': - weight[name] = torch.zeros((mlp_hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() + if name == "Wqkv": + weight[name] = torch.zeros( + (3 * hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + elif name == "out_proj": + weight[name] = torch.zeros( + (hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + elif name == "w1" or name == "w2": + weight[name] = torch.zeros( + (mlp_hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() else: - weight[name] = torch.zeros((hidden_size, mlp_hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() + weight[name] = torch.zeros( + (hidden_size, mlp_hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() block_memory[i] = weight reduce_scatter_memory = {} for key in size_key: - reduce_scatter_memory[key] = {'data': [], 'used': []} - + reduce_scatter_memory[key] = {"data": [], "used": []} + gpc.config.block_memory = block_memory gpc.config.reduce_scatter_memory = reduce_scatter_memory diff --git a/train.py b/train.py index 41ab070d..19a104ba 100644 --- a/train.py +++ b/train.py @@ -296,7 +296,7 @@ def main(args): memory_profiler.step() prof.step() - + if gpc.config.fstp_handler is not None: gpc.config.fstp_handler.zero_const_pool = {} gpc.config.fstp_handler.reduce_scatter_memory = {} From 95488d8e8f1737947c4f9a00f888d9f57e6ea606 Mon Sep 17 00:00:00 2001 From: "chenxun.p" Date: Fri, 20 Oct 2023 15:58:06 +0800 Subject: [PATCH 036/153] update optimizer accumulate grad impl when fstp --- .../core/scheduler/no_pipeline_scheduler.py | 1 - .../solver/optimizer/hybrid_zero_optim.py | 139 +++++++----------- 2 files changed, 54 insertions(+), 86 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index f0caf05c..56661d8c 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -194,7 +194,6 @@ def forward_backward_step( _output, _loss, _moe_loss = self._train_one_batch( _data, _label, engine, forward_only, return_loss, self._grad_accum_size ) - engine.optimizer.reset_reduce_bucket() if return_loss: loss += _loss diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 96a54c01..2c14c65d 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -2,6 +2,7 @@ # -*- encoding: utf-8 -*- import math +from typing import Optional, List from functools import partial import torch @@ -40,8 +41,20 @@ inf = math.inf logger = get_logger(__file__) + def print_memory(msg): - print(msg, " rank = ", gpc.get_global_rank(), " memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, " reverved memory: ", torch.cuda.memory_reserved() / 1024 / 1024 / 1024, " max memory: ", 
torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True)
+    print(
+        msg,
+        " rank = ",
+        gpc.get_global_rank(),
+        " memory allocated: ",
+        torch.cuda.memory_allocated() / 1024 / 1024 / 1024,
+        " reserved memory: ",
+        torch.cuda.memory_reserved() / 1024 / 1024 / 1024,
+        " max memory: ",
+        torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024,
+        flush=True,
+    )
     print("===========================================")


@@ -69,7 +82,7 @@ def __init__(
         backoff_factor = grad_scal_cfg.backoff_factor
         hysteresis = grad_scal_cfg.hysteresis
         max_scale = grad_scal_cfg.max_scale
-
+
         if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True:
             self._fstp_handler = gpc.config.fstp_handler

@@ -90,8 +103,8 @@ def __init__(
         # it will not manage the tensors used by mixed precision training
         self._param_store = ParameterStore(ParallelMode.ZERO1)
         self._grad_store = GradientStore(ParallelMode.DATA)
-        self._bucket_store = []
-        self._bucket_store_2 = []
+        self._bucket_store: List[BucketStore] = []
+        self._accum_grad_buckets: List[BucketStore] = []
         self._bucket_in_progress = []

         # fp16 and fp32 params for mixed precision training
@@ -160,7 +173,7 @@ def __init__(
             # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name
             self._broadcast_parallel_mode.append(zero_mode)
             self._bucket_store.append(BucketStore(group_id, param_group["dp_mode"]))
-            self._bucket_store_2.append(BucketStore(group_id, param_group["dp_mode"]))
+            self._accum_grad_buckets.append(BucketStore(group_id, param_group["dp_mode"]))

             # assign parameters to ranks the params in the list are sorted
             params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group)
@@ -306,9 +319,9 @@ def _define_and_attach(param, reduce_rank=None):
                 param=param,
                 reduce_rank=reduce_rank,
             )
-
+
             reduce_scatter_checker = partial(
-                self._wait_reduce_scatter_and_accumulate_grad,
+                self._wait_reduce_scatter_and_accumulate_grads,
                 param=param,
                 reduce_rank=reduce_rank,
             )
@@ -317,7 +330,7 @@ def _define_and_attach(param, reduce_rank=None):
             # NOT IMPORTANT BUT GOOD TO KNOW:
             # args here is not grad, but allow_unreacable and accumulate_grad
             def reduce_grad_hook(*args):  # pylint: disable=W0613
-                if gpc.config.fstp_handler is not None:
+                if self._fstp_handler is not None:
                     reduce_scatter_checker()

                 if self.skip_grad_reduce is False:
@@ -341,84 +354,36 @@ def belongs_to_current_rank(self, param) -> bool:
         group_id = getattr(param, "group_id")
         return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id])

-    def reset_reduce_bucket(self) -> None:
-        for bucket in self._bucket_store_2:
-            for rank, params in bucket._params.items():
-                for _param in params:
-                    if not hasattr(_param, "_fstp_reduce_scatter_str"):
-                        continue
-
-                    key = getattr(_param, "_fstp_reduce_scatter_str")
-                    comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key]
-                    comm_handle.wait()
-                    _param.grad.add_(_grad)
-                    # self._fstp_handler.reduce_scatter_handlers[key] = None
-                    # del _grad
-                    release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index)
-                    del self._fstp_handler.reduce_scatter_handlers[key]
-                    self._fstp_handler.reduce_scatter_handlers[key] = None
-                    assert key in self._fstp_handler.reduce_scatter_handlers
-                    # if not hasattr(_param, "_fstp_all_reduce_str"):
-                    #     continue
-
-                    # key = getattr(_param, "_fstp_all_reduce_str")
-                    # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key]
-                    # comm_handle.wait()
-                    # with torch.no_grad():
-                    #     _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0)
-                    # _param.grad.add_(_grad)
-                    # # self._fstp_handler.reduce_scatter_handlers[key] = None
-                    # del _grad
-                    # del self._fstp_handler.all_reduce_handlers[key]
-                    # self._fstp_handler.all_reduce_handlers[key] = None
-                    # assert key in self._fstp_handler.all_reduce_handlers
-
-            bucket.reset_by_rank(rank)
-
-    def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None):
+    def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None:
+        for _param in bucket.get_param(reduce_rank):
+            if not hasattr(_param, "_fstp_reduce_scatter_str"):
+                continue
+
+            # wait and accumulate gradient.
+            _key = getattr(_param, "_fstp_reduce_scatter_str")
+            _comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[_key]
+            _comm_handle.wait()
+            _param.grad.add_(_grad)
+
+            # release CUDA memory.
+            self._fstp_handler.reduce_scatter_handlers[_key] = None
+            _grad = None
+
+        bucket.reset_by_rank(reduce_rank)
+
+    def _wait_reduce_scatter_and_accumulate_grads(self, param, reduce_rank: Optional[int] = None):
         param_size = param.numel()
+        group_id = getattr(param, "group_id")
+        current_bucket = self._accum_grad_buckets[group_id]
+
         # check if the bucket is full
         # if full, will reduce the grads already in the bucket
         # after reduction, the bucket will be empty
-        group_id = getattr(param, "group_id")
-        current_bucket = self._bucket_store_2[group_id]
-
-        if current_bucket.num_elements_in_bucket(reduce_rank) >= 512 * 1024 * 1024:
-            # wait reduce scatter communication
-            params = current_bucket.get_param(reduce_rank)
-            for _param in params:
-                if not hasattr(_param, "_fstp_reduce_scatter_str"):
-                    continue
+        if current_bucket.num_elements_in_bucket(reduce_rank) >= self._reduce_bucket_size:
            self._accum_grads_store_in_bucket(current_bucket, reduce_rank)

-                key = getattr(_param, "_fstp_reduce_scatter_str")
-                comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key]
-                comm_handle.wait()
-                _param.grad.add_(_grad)
-                # self._fstp_handler.reduce_scatter_handlers[key] = None
-                # del _grad
-                release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index)
-                del self._fstp_handler.reduce_scatter_handlers[key]
-                self._fstp_handler.reduce_scatter_handlers[key] = None
-                assert key in self._fstp_handler.reduce_scatter_handlers
-
-                # if not hasattr(_param, "_fstp_all_reduce_str"):
-                #     continue
-
-                # key = getattr(_param, "_fstp_all_reduce_str")
-                # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key]
-                # comm_handle.wait()
-                # with torch.no_grad():
-                #     _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0)
-                # _param.grad.add_(_grad)
-                # # self._fstp_handler.reduce_scatter_handlers[key] = None
-                # del _grad
-                # del self._fstp_handler.all_reduce_handlers[key]
-                # self._fstp_handler.all_reduce_handlers[key] = None
-                # assert key in self._fstp_handler.all_reduce_handlers
-
-            current_bucket.reset_by_rank(reduce_rank)
-
+        # otherwise, add the parameter into the bucket.
         current_bucket.add_num_elements_in_bucket(param_size, reduce_rank)
         current_bucket.add_param(param, reduce_rank)
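`_wait_reduce_scatter_and_accumulate_grads` defers synchronization: parameters queue up until the bucket crosses `self._reduce_bucket_size`, and only then does `_accum_grads_store_in_bucket` wait on each pending reduce-scatter handle and accumulate the gradient shard. A simplified, standalone sketch of that flush discipline (hypothetical bookkeeping, not the optimizer's exact code):

```python
# Sketch of the threshold flush: queue parameters until the bucket is "full",
# then drain every pending async reduce-scatter and accumulate its shard.
from typing import Dict, List, Tuple

from torch import nn


class AccumGradBucket:
    def __init__(self, flush_threshold: int):
        self.flush_threshold = flush_threshold  # elements, e.g. 512 * 1024 * 1024
        self.params: List[nn.Parameter] = []
        self.num_elements = 0

    def add(self, param: nn.Parameter, pending: Dict[str, Tuple]) -> None:
        # pending maps a param key to (async work handle, reduce-scattered grad)
        if self.num_elements >= self.flush_threshold:
            self.flush(pending)
        self.params.append(param)
        self.num_elements += param.numel()

    def flush(self, pending: Dict[str, Tuple]) -> None:
        for p in self.params:
            handle, grad_shard = pending.pop(p._fstp_reduce_scatter_str)
            handle.wait()            # the communication must finish first
            p.grad.add_(grad_shard)  # accumulate this rank's shard
        self.params.clear()
        self.num_elements = 0
```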
@@ -646,6 +611,10 @@ def step(self, closure=None):
         for group_id in range(self.num_param_groups):
             self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True)

+        # we need to accumulate gradients left in the gradient accumulation bucket
+        for group_id in range(self.num_param_groups):
+            self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id], reduce_rank=None)
+
         # compute norm for gradients in the before bucket
         groups_norms = []
         for group_id in range(self.num_param_groups):
@@ -685,16 +654,16 @@ def step(self, closure=None):
         timer("sync_grad").start()
         self._sync_grad()
         timer("sync_grad").stop()
-
+
         print_memory("No 4")
-
+
         try:
-            res = self._step(closure=closure, norms=total_norms)
+            res = self._step(closure=closure, norms=total_norms)
         except torch.cuda.OutOfMemoryError as e:
             print(e, flush=True)
             print(torch.cuda.memory_summary(), flush=True)
             torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle")
-
+
         return res

     def _step(self, closure=None, norms=None):
@@ -822,7 +791,7 @@ def _step(self, closure=None, norms=None):
             torch.cuda.synchronize()
             with torch.cuda.stream(self._comm_bcast_stream):
                 self.broadcast_params()
-
+
         timer("step").stop()

         # update gradients may not be needed here, because the sync_params function is used in initialization,

From d91a5d9d9ec8c7b0444b533a6b44be4430c7c199 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Fri, 20 Oct 2023 15:59:40 +0800
Subject: [PATCH 037/153] feat(initialize/launch.py): refactor config for fstp

---
 configs/7B_sft.py                             | 10 ++---
 internlm/initialize/launch.py                 | 23 ++++++----
 internlm/model/modeling_internlm.py           | 14 +++---
 internlm/model/multi_head_attention.py        |  8 ++--
 .../solver/optimizer/hybrid_zero_optim.py     | 45 ++++++++++++-------
 internlm/train/training_internlm.py           |  3 +-
 internlm/utils/evaluation.py                  |  4 +-
 7 files changed, 63 insertions(+), 44 deletions(-)

diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 6ea8b96e..c51c8129 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -152,19 +152,19 @@
 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the
+        'intern' sp mode, defaults to False.
 pipeline parallel (dict):
     1. size: int, the size of pipeline parallel.
     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
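For reference, the schema change documented above amounts to the following migration (an illustrative sketch; the bare-int form and its normalization come from the launch.py hunk below):

```python
# Old-style vs refactored tensor parallel config (values are examples).
old = dict(tensor=dict(size=8, mode="fstp", overlap=True))         # pre-refactor
new = dict(tensor=dict(size=8, sp="intern", intern_overlap=True))  # post-refactor

# A bare int is still accepted and is expanded to the dict form:
tensor = 8
tensor = dict(size=tensor, sp="none", intern_overlap=False)
```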
""" parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp", overlap=True), + tensor=dict(size=8, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, ) cudnn_deterministic = False diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 80611fee..0e74f76b 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -306,15 +306,20 @@ def args_sanity_check(): ), "sequence parallel does not support use_flash_attn=False" if isinstance(gpc.config.parallel["tensor"], int): - gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode="origin_tp") - - if gpc.config.parallel["tensor"].get("mode", None) is None: - gpc.config.parallel["tensor"]["mode"] = "origin_tp" - - if gpc.config.parallel["tensor"].get("mode", None) == "fstp": - assert ( - gpc.config.parallel.sequence_parallel is True - ), "when the tp_mode is fstp, the sequence_parallel should be True." + gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], sp="none", intern_overlap=False) + if gpc.config.parallel["tensor"].get("sp", None) is None: + gpc.config.parallel["tensor"]["sp"] = "none" + if gpc.config.parallel["tensor"].get("intern_overlap", None) is None: + gpc.config.parallel["tensor"]["intern_overlap"] = False + assert gpc.config.parallel["tensor"].get("sp", None) in [ + "none", + "megatron", + "flash-attn", + "intern", + ], "invalid sp mode, only ['none', 'megatron', 'flash-attn', 'intern'] is supported" + # adapt to old version's sequence parallel config + if gpc.config.parallel["tensor"].get("sp", None) in ["megatron", "flash-attn", "intern"]: + gpc.config.parallel.sequence_parallel = True # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy if hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1: diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 0df2b60e..9b6420d4 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -77,7 +77,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, - tp_mode: str = "origin_tp", + sp_mode: str = "none", ): super().__init__() self.checkpoint = checkpoint @@ -102,7 +102,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, - tp_mode=tp_mode, + sp_mode=sp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -114,7 +114,7 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - mlp_cls = FeedForward if tp_mode == "origin_tp" else FSTPFeedForward + mlp_cls = FSTPFeedForward if sp_mode == "intern" else FeedForward self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), @@ -297,7 +297,7 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) - self.tp_mode = gpc.config.parallel["tensor"]["mode"] + self.sp_mode = gpc.config.parallel["tensor"]["sp"] if is_reward: head_cls = RewardModelLinear @@ -343,7 +343,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, - tp_mode=self.tp_mode, + sp_mode=self.sp_mode, ) for lid in range(num_layers) ] @@ -389,8 +389,8 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. 
         indexes = indexes[0]
-        # if the tensor parallel mode is 'fstp', the indexes should also be split in sequence dimension.
-        if gpc.config.parallel.sequence_parallel and self.tp_mode == "fstp":
+        # if the sequence parallel mode is 'intern', the indexes should also be split in sequence dimension.
+        if gpc.config.parallel.sequence_parallel and self.sp_mode == "intern":
             indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0)

         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None
diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py
index 8dcd3f96..cb0efb85 100644
--- a/internlm/model/multi_head_attention.py
+++ b/internlm/model/multi_head_attention.py
@@ -175,7 +175,7 @@ def __init__(
         use_flash_attn: bool = True,
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
-        tp_mode: str = "origin_tp",
+        sp_mode: str = "none",
     ) -> None:
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
@@ -203,7 +203,7 @@ def __init__(
             self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device)

         # notice here should change bias=True
-        Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear
+        Wqkv_cls = FSTPLinear if sp_mode == "intern" else ColumnParallelLinearTorch
         self.Wqkv = Wqkv_cls(
             embed_dim,
             3 * embed_dim,
@@ -219,12 +219,12 @@ def __init__(
         self.inner_cross_attn = inner_cross_attn_cls(
             causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
         )
-        if tp_mode == "fstp":
+        if sp_mode == "intern":
             self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group)
             self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group)

         # output projection always have the bias (for now)
-        out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear
+        out_proj_cls = FSTPLinear if sp_mode == "intern" else RowParallelLinearTorch
         self.out_proj = out_proj_cls(
             embed_dim,
             embed_dim,
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index 96a54c01..a4b31737 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -10,7 +10,10 @@

 from internlm.core.context import Config, ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.model.utils import split_forward_gather_backward, release_reduce_scatter_memory_pool
+from internlm.model.utils import (
+    release_reduce_scatter_memory_pool,
+    split_forward_gather_backward,
+)
 from internlm.monitor import send_alert_message
 from internlm.solver.optimizer.store import (
     BucketStore,
@@ -40,8 +43,20 @@

 inf = math.inf
 logger = get_logger(__file__)

+
 def print_memory(msg):
-    print(msg, " rank = ", gpc.get_global_rank(), " memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, " reverved memory: ", torch.cuda.memory_reserved() / 1024 / 1024 / 1024, " max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True)
+    print(
+        msg,
+        " rank = ",
+        gpc.get_global_rank(),
+        " memory allocated: ",
+        torch.cuda.memory_allocated() / 1024 / 1024 / 1024,
+        " reserved memory: ",
+        torch.cuda.memory_reserved() / 1024 / 1024 / 1024,
+        " max memory: ",
+        torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024,
+        flush=True,
+    )
     print("===========================================")


@@ -69,7 +82,7 @@ def __init__(
        backoff_factor =
grad_scal_cfg.backoff_factor hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - - if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: + + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: self._fstp_handler = gpc.config.fstp_handler # Zero related args @@ -306,7 +321,7 @@ def _define_and_attach(param, reduce_rank=None): param=param, reduce_rank=reduce_rank, ) - + reduce_scatter_checker = partial( self._wait_reduce_scatter_and_accumulate_grad, param=param, @@ -354,7 +369,7 @@ def reset_reduce_bucket(self) -> None: _param.grad.add_(_grad) # self._fstp_handler.reduce_scatter_handlers[key] = None # del _grad - release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index) + release_reduce_scatter_memory_pool(size=tuple(_grad.size()), index=_grad.index) del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers @@ -374,7 +389,7 @@ def reset_reduce_bucket(self) -> None: # assert key in self._fstp_handler.all_reduce_handlers bucket.reset_by_rank(rank) - + def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): param_size = param.numel() @@ -397,11 +412,11 @@ def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): _param.grad.add_(_grad) # self._fstp_handler.reduce_scatter_handlers[key] = None # del _grad - release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index) + release_reduce_scatter_memory_pool(size=tuple(_grad.size()), index=_grad.index) del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers - + # if not hasattr(_param, "_fstp_all_reduce_str"): # continue @@ -418,7 +433,7 @@ def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): # assert key in self._fstp_handler.all_reduce_handlers current_bucket.reset_by_rank(reduce_rank) - + current_bucket.add_num_elements_in_bucket(param_size, reduce_rank) current_bucket.add_param(param, reduce_rank) @@ -685,16 +700,16 @@ def step(self, closure=None): timer("sync_grad").start() self._sync_grad() timer("sync_grad").stop() - + print_memory("No 4") - + try: - res = self._step(closure=closure, norms=total_norms) + res = self._step(closure=closure, norms=total_norms) except torch.cuda.OutOfMemoryError as e: print(e, flush=True) print(torch.cuda.memory_summary(), flush=True) torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") - + return res def _step(self, closure=None, norms=None): @@ -822,7 +837,7 @@ def _step(self, closure=None, norms=None): torch.cuda.synchronize() with torch.cuda.stream(self._comm_bcast_stream): self.broadcast_params() - + timer("step").stop() # update gradients may not be needed here, because the sync_params function is used in initialization, diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 5205ba5b..53996b38 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -110,9 +110,8 @@ def initialize_model(): gpc.config.fstp_handler = None - if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: handler = 
CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) - # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 968a1db1..f708fa78 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -54,7 +54,7 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - if gpc.config.parallel["tensor"]["mode"] == "fstp": + if gpc.config.parallel["tensor"]["sp"] == "intern": gpc.config.parallel.sequence_parallel = True else: gpc.config.parallel.sequence_parallel = False @@ -106,7 +106,7 @@ def evaluate_on_val_dls( total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz - if gpc.config.parallel["tensor"]["mode"] == "fstp": + if gpc.config.parallel["tensor"]["sp"] == "intern": sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR) tensor_shape = torch.Size( [ From eac382ad0a0ed6075b31fbdb8a56d42239fa9f4f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 20 Oct 2023 16:22:29 +0800 Subject: [PATCH 038/153] feat(optimizer/hybrid_zero_optim.py): fix lint error --- internlm/model/utils.py | 5 ++--- internlm/solver/optimizer/hybrid_zero_optim.py | 5 +---- internlm/solver/optimizer/store.py | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index b9c7c03a..19531e4a 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -1,12 +1,12 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Any, Optional, Union +from typing import Optional import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F -from flash_attn.utils.distributed import all_reduce_raw # , reduce_scatter_raw +from flash_attn.utils.distributed import all_reduce_raw from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup @@ -397,7 +397,6 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.contiguous() process_group = ctx.process_group all_gather_handler = ctx.all_gather_handler - module = ctx.module block_index = ctx.block_index module_name = ctx.module_name diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index d5fec315..cb8aa659 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -11,10 +11,7 @@ from internlm.core.context import Config, ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import ( - release_reduce_scatter_memory_pool, - split_forward_gather_backward, -) +from internlm.model.utils import release_reduce_scatter_memory_pool from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, diff --git a/internlm/solver/optimizer/store.py b/internlm/solver/optimizer/store.py index 228045ed..f486ccec 100644 --- a/internlm/solver/optimizer/store.py +++ b/internlm/solver/optimizer/store.py @@ -45,7 +45,7 @@ def __init__(self, group_id, dp_parallel_mode): def num_elements_in_bucket(self, reduce_rank: int = None): return self._num_elements_in_bucket[reduce_rank] - + def 
num_params_in_bucket(self, reduce_rank: int = None): return len(self._params[reduce_rank]) From 2acf9b817f6888e73c3606ddc6549f8c95694b27 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 20 Oct 2023 16:25:08 +0800 Subject: [PATCH 039/153] feat(utils/gputest.py): fix lint error --- internlm/utils/gputest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index 52d96385..bf4cf1c9 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -45,7 +45,7 @@ def empty_cache_and_diag(batch_count, interval=50): # # import time # # time.sleep(10) # print(e, "rank = ", gpc.get_global_rank(), flush=True) - # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") # do empty_cache after the bench torch.cuda.empty_cache() From dcd89ed30466b7552f79077af5049e3581d46270 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Fri, 20 Oct 2023 17:50:56 +0800 Subject: [PATCH 040/153] refactor linear --- configs/7B_sft.py | 2 +- internlm/model/linear.py | 350 ++++++++---------- internlm/model/modeling_internlm.py | 24 +- internlm/model/multi_head_attention.py | 12 +- internlm/model/utils.py | 206 +++++++++-- .../solver/optimizer/hybrid_zero_optim.py | 56 +-- internlm/train/training_internlm.py | 3 +- train.py | 4 +- 8 files changed, 357 insertions(+), 300 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 6ea8b96e..0058e04f 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp", overlap=True), + tensor=dict(size=8, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 4f05cd32..8f57a02a 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -19,25 +19,26 @@ all_gather_raw_memory_pool, fstp_fused_dense_func, fused_dense_func_torch, + megatron_fused_dense_func_torch, ) -class ScaleColumnParallelLinear(nn.Linear): +class BaseScaleColumnParallelLinear(nn.Linear): """ - ScaleColumnParallelLinear. - - Args: - in_features (int): size of each input sample - out_features (int): size of each output sample - process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. - bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False - in the config. - sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: - we do an all_gather of x before doing the matmul. - If not, then the input is already gathered. - device (Optional[Union[str, torch.device]]): The device will be used. - dtype (Optional[torch.dtype]): The type of data. - weight_scale (int): For training stability. 1 by default. + Base class for ScaleColumnParallelLinear. + + Args: + in_features (int): size of each input sample + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. 
+ sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul. + If not, then the input is already gathered. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + weight_scale (int): For training stability. 1 by default. """ def __init__( @@ -57,6 +58,10 @@ def __init__( self.process_group = process_group self.weight_scale = weight_scale +class ScaleColumnParallelLinear(BaseScaleColumnParallelLinear): + """ + ScaleColumnParallelLinear in flash implementation. + """ def forward(self, input, gather_dim=0): # pylint: disable=W0622 # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. @@ -74,6 +79,27 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 gather_dim=gather_dim, ) +class MegatronScaleColumnParallelLinear(BaseScaleColumnParallelLinear): + """ + ScaleColumnParallelLinear in megatron implementation. + """ + + def forward(self, input, gather_dim=0): # pylint: disable=W0622 + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. + if self.weight_scale != 1: + weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() + else: + weight = self.weight + return megatron_fused_dense_func_torch( + input, + weight, + self.bias, + process_group=self.process_group, + sequence_parallel=gpc.config.parallel.sequence_parallel, + gather_dim=gather_dim, + ) class RewardModelLinear(ScaleColumnParallelLinear): """ @@ -129,7 +155,6 @@ def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. - return fused_dense_func_torch( x, self.weight, @@ -139,6 +164,19 @@ def forward(self, x, gather_dim=0): gather_dim=gather_dim, ) +class MegatronColumnParallelLinearTorch(ColumnParallelLinear): + def forward(self, x, gather_dim=0): + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. + return megatron_fused_dense_func_torch( + x, + self.weight, + self.bias, + process_group=self.process_group, + sequence_parallel=self.sequence_parallel, + gather_dim=gather_dim, + ) class RowParallelLinearTorch(RowParallelLinear): def forward(self, x): @@ -150,10 +188,20 @@ def forward(self, x): reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce return reduce_fn(out, self.process_group) +class MegatronRowParallelLinearTorch(RowParallelLinear): + def forward(self, x): + """ + We're doing Tensor Parallel with sequence parallelism: we do the matmul and then + a reduce_scatter of the result. + """ + out = megatron_fused_dense_func_torch(x, self.weight, self.bias) + reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce + return reduce_fn(out, self.process_group) + -class FeedForward(nn.Module): +class BaseFeedForward(nn.Module): """ - FeedForward. + Base FeedForward in flash implementation. 
Args: in_features (int): size of each input sample @@ -177,13 +225,13 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - block_idx: int = 0, + colum_cls = None, + row_cls = None, ): super().__init__() - hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - self.w1 = ColumnParallelLinearTorch( + self.w1 = colum_cls( in_features, hidden_features, process_group, @@ -192,7 +240,7 @@ def __init__( device=device, dtype=dtype, ) - self.w2 = ColumnParallelLinearTorch( + self.w2 = colum_cls( in_features, hidden_features, process_group, @@ -201,7 +249,7 @@ def __init__( device=device, dtype=dtype, ) - self.w3 = RowParallelLinearTorch( + self.w3 = row_cls( hidden_features, out_features, process_group, @@ -217,6 +265,66 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out +class FeedForward(BaseFeedForward): + """ + FeedForward in flash implementation. + + Args: + in_features (int): size of each input sample + hidden_features (int): size of hidden state of FFN + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + multiple_of (int): For efficient training. Reset the size of hidden feature. 256 by default. + """ + + def __init__( + self, + in_features: int, + hidden_features: int, + out_features: int = None, + process_group: Optional[torch.distributed.ProcessGroup] = None, + bias: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + multiple_of: int = 256, + ): + super().__init__(in_features, hidden_features, out_features, process_group, bias, device, + dtype, multiple_of, ColumnParallelLinearTorch, RowParallelLinearTorch) + + +class MegatronFeedForward(BaseFeedForward): + """ + FeedForward in megatron implementation. + + Args: + in_features (int): size of each input sample + hidden_features (int): size of hidden state of FFN + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + multiple_of (int): For efficient training. Reset the size of hidden feature. 256 by default. + """ + + def __init__( + self, + in_features: int, + hidden_features: int, + out_features: int = None, + process_group: Optional[torch.distributed.ProcessGroup] = None, + bias: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + multiple_of: int = 256, + ): + super().__init__(in_features, hidden_features, out_features, process_group, bias, device, + dtype, multiple_of, MegatronColumnParallelLinearTorch, MegatronRowParallelLinearTorch) class FSTPLinear(ColumnParallelLinear): def forward(self, x): @@ -228,10 +336,9 @@ def forward(self, x): module=self, handler=gpc.config.fstp_handler, block_index=block_index, module_name=name ) - -class FSTPFeedForward(nn.Module): +class FSTPFeedForward(BaseFeedForward): """ - FeedForward. + FeedForward in FSTP. 
Args: in_features (int): size of each input sample @@ -255,169 +362,35 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - block_idx: int = 0, ): - super().__init__() - - hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - - self.w1 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w2 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w3 = FSTPLinear( - hidden_features, - out_features, - process_group, - bias=bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - - def forward(self, x): - w1_o = self.w1(x) - w2_o = self.w2(x) - out = self.w3(F.silu(w1_o) * w2_o) - return out - - -class FSTPAllGatherSyncHandler: - """ - All-gather handler for overlapping the all-gather in adjcent FSTP linear. - """ - - def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - # import pdb; pdb.set_trace() - self.process_group = process_group - self.FSTP_modules = [] - self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward - self.module_handler = dict() # key: FSTP module; value: all-gather handler - self.module_block = dict() # key: FSTP module; value: transformer block index - self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name - - self.reduce_scatter_handlers = {} - self.all_reduce_handlers = {} - - # just want to share same for loop for ModuleList and Module - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _chunk_name, children in _chunk.named_children(): - if isinstance(children, nn.ModuleList): - for idx, block in enumerate(children): - index = 0 - self.block_module[idx] = {} - for _sub_name, sub in block.named_children(): - sub_modules = list(sub.children()) - if len(sub_modules) > 0: - for name, child in sub.named_children(): - if isinstance(child, FSTPLinear): - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - - self.FSTP_modules.append(child) - self.module_block[child] = idx - self.block_module[idx][index] = child - self.module_name_index[child] = index - index = index + 1 - else: - continue - - def _register_sync_parameters_hook(self) -> None: - """ - register pre_forward_hook and pre_backward_hook for FSTPLinear. 
- """ - - def _pre_forward_hook(module: nn.Module, inputs: Any): - block_index = self.module_block[module] - name_index = self.module_name_index[module] - if name_index == 0: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - else: - handler = self.module_handler[module] - handler.wait() - if name_index != 4: - next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - - def _post_forward_hook(module: nn.Module, input, output): - if module in self.FSTP_global_weights: - del self.FSTP_global_weights[module] - if module in self.module_handler: - del self.module_handler[module] - - def _pre_backward_hook(module: nn.Module, grad_output): - block_index = self.module_block[module] - name_index = self.module_name_index[module] - if name_index == 4: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - else: - handler = self.module_handler[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - - def _post_backward_hook(module, grad_input, grad_output): - del self.FSTP_global_weights[module] - - for module in self.FSTP_modules: - # import pdb; pdb.set_trace() - module.register_forward_pre_hook(_pre_forward_hook) - module.register_forward_hook(_post_forward_hook) - # module.register_backward_pre_hook(_pre_backward_hook) - # module.register_backward_hook(_post_backward_hook) - module.register_full_backward_pre_hook(_pre_backward_hook) - module.register_full_backward_hook(_post_backward_hook) - + super().__init__(in_features, hidden_features, out_features, process_group, bias, device, + dtype, multiple_of, FSTPLinear, FSTPLinear) + +def get_mlp_cls(sp_mode: str): + if sp_mode in ["none", "flash-attn"]: + mlp_cls = FeedForward + elif sp_mode == "megatron": + mlp_cls = MegatronFeedForward + else: + mlp_cls = FSTPFeedForward + return mlp_cls + +def get_linear_cls(sp_mode: str, parallel_mode: str): + if parallel_mode == "column": + if sp_mode in ["none", "flash-attn"]: + cls = ColumnParallelLinearTorch + elif sp_mode == "megatron": + cls = MegatronColumnParallelLinearTorch + else: + cls = FSTPLinear + elif parallel_mode == 'row': + if sp_mode in ["none", "flash-attn"]: + cls = RowParallelLinearTorch + elif sp_mode == "megatron": + cls = MegatronRowParallelLinearTorch + else: + cls = FSTPLinear + return cls class CoarseGrainedFSTPAllGatherSyncHandler: """ @@ 
-468,7 +441,6 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non sub_modules = list(sub.children()) if len(sub_modules) > 0: for name, child in sub.named_children(): - # print(f"name: {name}", flush=True) if name == "out_proj": self.FSTP_outs.append(child) self.module_to_index[child] = idx diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index b004dffa..99d540fd 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -15,9 +15,12 @@ from internlm.model.embedding import Embedding1D from internlm.model.linear import ( FeedForward, + MegatronFeedForward, FSTPFeedForward, RewardModelLinear, ScaleColumnParallelLinear, + MegatronScaleColumnParallelLinear, + get_mlp_cls, ) from internlm.model.multi_head_attention import MHA from internlm.model.utils import ( @@ -77,8 +80,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, - tp_mode: str = "origin_tp", - block_idx: int = 0, + sp_mode: str = "none", ): super().__init__() self.checkpoint = checkpoint @@ -103,8 +105,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, - tp_mode=tp_mode, - block_idx=block_idx, + sp_mode=sp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -116,7 +117,7 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - mlp_cls = FeedForward if tp_mode == "origin_tp" else FSTPFeedForward + mlp_cls = get_mlp_cls(sp_mode) self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), @@ -299,12 +300,16 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) - self.tp_mode = gpc.config.parallel["tensor"]["mode"] + self.sp_mode = gpc.config.parallel["tensor"]["sp"] + if self.sp_mode == "none": + gpc.config.parallel.sequence_parallel = False + else: + gpc.config.parallel.sequence_parallel = True if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear + head_cls = ScaleColumnParallelLinear if self.sp_mode in ["flash-attn", "none", "intern"] else MegatronScaleColumnParallelLinear if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -345,8 +350,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, - tp_mode=self.tp_mode, - block_idx=lid, + sp_mode=self.sp_mode, ) for lid in range(num_layers) ] @@ -393,7 +397,7 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] # if the tensor parallel mode is 'fstp', the indexes should also be split in sequence dimension. 
-            if gpc.config.parallel.sequence_parallel and self.tp_mode == "fstp":
+            if gpc.config.parallel.sequence_parallel and self.sp_mode == "intern":
                 indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0)
 
         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None
diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py
index 7a0f4ed7..8ba49edd 100644
--- a/internlm/model/multi_head_attention.py
+++ b/internlm/model/multi_head_attention.py
@@ -42,6 +42,9 @@
     ColumnParallelLinearTorch,
     FSTPLinear,
     RowParallelLinearTorch,
+    MegatronColumnParallelLinearTorch,
+    MegatronRowParallelLinearTorch,
+    get_linear_cls,
 )
@@ -175,8 +178,7 @@ def __init__(
         use_flash_attn: bool = True,
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
-        tp_mode: str = "origin_tp",
-        block_idx: int = 0,
+        sp_mode: str = "none",
     ) -> None:
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
@@ -204,7 +206,7 @@ def __init__(
             self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device)
 
         # notice here should change bias=True
-        Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear
+        Wqkv_cls = get_linear_cls(sp_mode, "column")
         self.Wqkv = Wqkv_cls(
             embed_dim,
             3 * embed_dim,
@@ -220,12 +222,12 @@ def __init__(
         self.inner_cross_attn = inner_cross_attn_cls(
             causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
         )
-        if tp_mode == "fstp":
+        if sp_mode == "intern":
             self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group)
             self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group)
 
         # output projection always have the bias (for now)
-        out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear
+        out_proj_cls = get_linear_cls(sp_mode, 'row')
         self.out_proj = out_proj_cls(
             embed_dim,
             embed_dim,
diff --git a/internlm/model/utils.py b/internlm/model/utils.py
index 2667efed..6757906c 100644
--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@@ -164,7 +164,7 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup,
 
 # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py
 class FusedDenseFunc(torch.autograd.Function):
-    "tp fused dense function"
+    "FusedDenseFunc for tensor parallel in flash-attn implementation."
 
     @staticmethod
     @custom_fwd
@@ -255,9 +255,96 @@ def backward(ctx, grad_output, *args):
         return grad_input, grad_weight, grad_bias, None, None, None, None
 
 
+class MegatronFusedDenseFunc(torch.autograd.Function):
+    '''
+    FusedDenseFunc for tensor parallel in megatron implementation.
+    The difference between the implementation of flash-attn and megatron is that the total_x could be saved for backward in megatron,
+    so that the all-gather in backward is omitted.
+    '''
+
+    @staticmethod
+    @custom_fwd
+    def forward(ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True, gather_dim=0):
+        """
+        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
+        with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
+ """ + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + ctx.sequence_parallel = sequence_parallel + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + if process_group is not None and sequence_parallel: + # We want to kick off the all_gather early, before weight dtype conversion + total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) + else: + total_x = x + + if torch.is_autocast_enabled(): + weight = weight.to(dtype=torch.get_autocast_gpu_dtype()) + bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + weight = weight.contiguous() + if process_group is not None and sequence_parallel: + handle_x.wait() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *weight.shape) > 65535 * 32: + raise RuntimeError("fused_dense only supports matrix dims <= 2M") + output = F.linear(total_x, weight, bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(total_x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + (grad_input,) = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + sequence_parallel = ctx.sequence_parallel + + if ctx.compute_weight_gradient: + total_x, weight = ctx.saved_tensors + else: + (weight,) = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + if process_group is not None: + reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw + grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) + else: + grad_input = None + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + if process_group is not None and ctx.needs_input_grad[0]: + handle_grad_input.wait() + return grad_input, grad_weight, grad_bias, None, None, None, None + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFuncTorch(FusedDenseFunc): - """A custom PyTorch module extending FusedDenseFunc.""" + '''FusedDenseFunc in flash implementation for supporting torch.float32''' @staticmethod @custom_bwd @@ -307,17 +394,61 @@ def backward(ctx, grad_output, *args): handle_grad_input.wait() return grad_input, grad_weight, grad_bias, None, None, None, None +class MegatronFusedDenseFuncTorch(FusedDenseFunc): + '''FusedDenseFunc in megatron implementation for supporting torch.float32''' + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, 
*args):
+        grad_output = grad_output.contiguous()
+        if ctx.return_residual:
+            (grad_input,) = args
+            grad_input = grad_input.contiguous()
+        process_group = ctx.process_group
+        sequence_parallel = ctx.sequence_parallel
+        gather_dim = ctx.gather_dim
+        if ctx.compute_weight_gradient:
+            total_x, weight = ctx.saved_tensors
+        else:
+            (weight,) = ctx.saved_tensors
+            total_x = None
+        batch_shape = grad_output.shape[:-1]
+        batch_dim = batch_shape.numel()
+        grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
+        if ctx.needs_input_grad[0]:
+            if not ctx.return_residual:
+                grad_input = F.linear(grad_output, weight.t())
+            else:
+                grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight)
+            grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
+            if process_group is not None:
+                reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
+                grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True)
+        else:
+            grad_input = None
+        if ctx.needs_input_grad[1]:
+            assert ctx.compute_weight_gradient
+            # we remove the cuda dependence here, which is different from flash_attn.
+            grad_weight, grad_bias = linear_bias_wgrad_torch(
+                total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2]
+            )
+        else:
+            grad_weight = None
+            grad_bias = grad_output if ctx.needs_input_grad[2] else None
+        if process_group is not None and ctx.needs_input_grad[0]:
+            handle_grad_input.wait()
+        return grad_input, grad_weight, grad_bias, None, None, None, None
 
 class FSTPFusedDenseFunc(torch.autograd.Function):
-    "FSTP fused dense function"
+    "FusedDenseFunc for FSTP, which is optimized based on flash implementation."
 
     @staticmethod
     @custom_fwd
-    def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None, block_index=None, module_name=None):
+    def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, overlap_handler=None, block_index=None, module_name=None):
         ctx.compute_weight_gradient = weight.requires_grad
         ctx.return_residual = return_residual
         ctx.process_group = process_group
-        ctx.all_gather_handler = all_gather_handler
+        ctx.overlap_handler = overlap_handler
         ctx.module = module
         ctx.block_index = block_index
         ctx.module_name = module_name
@@ -329,13 +460,12 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod
         world_size = gpc.get_world_size(ParallelMode.TENSOR)
         if world_size > 1:
             # do all_gather for weight and bias before actual computation
-            if all_gather_handler is not None:# and module in all_gather_handler.FSTP_global_weights:
-                # total_weight = all_gather_handler.FSTP_global_weights[module]
-                total_weight = gpc.config.block_memory[block_index % 2][module_name]
+            if overlap_handler is not None:
+                total_weight = gpc.config.block_memory[block_index % 2][module_name]
             else:
                 total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True)
                 handle_weight.wait()
-
+            # TODO memory pool for bias
             if bias is not None:
                 total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True)
                 handle_bias.wait()
@@ -356,6 +486,7 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod
         if min(batch_dim, n, *total_weight.shape) > 65535 * 32:
             raise RuntimeError("fused_dense only supports matrix dims <= 2M")
         output = F.linear(total_x, total_weight, total_bias)
+        # release memory
         del total_weight
         del total_bias
         if ctx.compute_weight_gradient:
@@ -372,8 +503,7 @@ def
backward(ctx, grad_output, *args): (grad_input,) = args grad_input = grad_input.contiguous() process_group = ctx.process_group - all_gather_handler = ctx.all_gather_handler - module = ctx.module + overlap_handler = ctx.overlap_handler block_index = ctx.block_index module_name = ctx.module_name @@ -389,51 +519,35 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: - total_weight = gpc.config.block_memory[block_index % 2][module_name] - # # do all-gather for weight before backward - # if module in all_gather_handler.FSTP_global_weights: - # total_weight = all_gather_handler.FSTP_global_weights[module] - # else: - # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - # handle_weight.wait() + if overlap_handler is not None: + total_weight = gpc.config.block_memory[block_index % 2][module_name] + else: + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() else: total_weight = weight # compute weight grad if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - if gpc.config.fstp_handler is not None: - # grad_weight_async, handle_grad_weight = all_reduce_raw(grad_weight, process_group, async_op=True) - # assert hasattr(weight, "_fstp_all_reduce_str") - # all_gather_handler.all_reduce_handlers[weight._fstp_all_reduce_str] = (handle_grad_weight, grad_weight_async) - # grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) - # if grad_bias is not None: - # grad_bias_async, handle_grad_bias = all_reduce_raw(grad_bias, process_group, async_op=True) - # assert hasattr(bias, "_fstp_all_reduce_str") - # all_gather_handler.all_reduce_handlers[bias._fstp_all_reduce_str] = (handle_grad_bias, grad_bias_async) - # grad_bias = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) - + if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(grad_weight, process_group, async_op=True) assert hasattr(weight, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) + grad_weight = overlap_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool(grad_bias, process_group, async_op=True) assert hasattr(bias, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = all_gather_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), 
*grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) + overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) + grad_bias = overlap_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) else: grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - # grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - # if grad_bias is not None: - # grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -449,7 +563,7 @@ def backward(ctx, grad_output, *args): del total_weight if ctx.needs_input_grad[1]: - if world_size > 1 and gpc.config.fstp_handler is None: + if world_size > 1 and overlap_handler is None: handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() @@ -473,6 +587,22 @@ def fused_dense_func_torch( else: return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) +def megatron_fused_dense_func_torch( + x: Tensor, + weight: Tensor, + bias: Optional[Tensor] = None, + return_residual: bool = False, + process_group: Optional[ProcessGroup] = None, + sequence_parallel: bool = True, + gather_dim: int = 0, +): + dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( + x.dtype == torch.float32 and torch.is_autocast_enabled() + ) + if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: + return MegatronFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) + else: + return MegatronFusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) def fstp_fused_dense_func( x: Tensor, diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 96a54c01..4de5c7cd 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -40,11 +40,6 @@ inf = math.inf logger = get_logger(__file__) -def print_memory(msg): - print(msg, " rank = ", gpc.get_global_rank(), " memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, " reverved memory: ", torch.cuda.memory_reserved() / 1024 / 1024 / 1024, " max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) - print("===========================================") - - class HybridZeroOptimizer(BaseOptimizer): """ Hybrid Zero Optimizer. 
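The pattern above is worth spelling out: backward launches the gradient reduce-scatter asynchronously, stores the (handle, result) pair on the handler keyed by `_fstp_reduce_scatter_str`, and hands autograd a zero tensor of the sharded shape; the optimizer below waits on the handle and accumulates the real gradient before reducing buckets. What follows is a minimal sketch of that handoff, with `handlers` and `key` as illustrative stand-ins and without the patch's buffer pooling.

import torch
import torch.distributed as dist


def async_weight_grad_reduce_scatter(grad_weight, process_group, handlers, key):
    # Launch the reduce-scatter without blocking the rest of backward.
    world_size = dist.get_world_size(process_group)
    shard_shape = (grad_weight.shape[0] // world_size, *grad_weight.shape[1:])
    grad_shard = torch.empty(shard_shape, dtype=grad_weight.dtype, device=grad_weight.device)
    handle = dist.reduce_scatter_tensor(
        grad_shard, grad_weight.contiguous(), group=process_group, async_op=True
    )
    # Stash the in-flight communication for the optimizer to consume later.
    handlers[key] = (handle, grad_shard)
    # Autograd needs a tensor of the sharded shape right now, so return zeros;
    # the real values are added in once the optimizer waits on the handle.
    return torch.zeros(shard_shape, dtype=grad_weight.dtype, device=grad_weight.device)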
@@ -70,7 +65,7 @@ def __init__( hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] == True: self._fstp_handler = gpc.config.fstp_handler # Zero related args @@ -358,20 +353,7 @@ def reset_reduce_bucket(self) -> None: del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers - # if not hasattr(_param, "_fstp_all_reduce_str"): - # continue - - # key = getattr(_param, "_fstp_all_reduce_str") - # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key] - # comm_handle.wait() - # with torch.no_grad(): - # _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0) - # _param.grad.add_(_grad) - # # self._fstp_handler.reduce_scatter_handlers[key] = None - # del _grad - # del self._fstp_handler.all_reduce_handlers[key] - # self._fstp_handler.all_reduce_handlers[key] = None - # assert key in self._fstp_handler.all_reduce_handlers + bucket.reset_by_rank(rank) @@ -401,21 +383,6 @@ def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers - - # if not hasattr(_param, "_fstp_all_reduce_str"): - # continue - - # key = getattr(_param, "_fstp_all_reduce_str") - # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key] - # comm_handle.wait() - # with torch.no_grad(): - # _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0) - # _param.grad.add_(_grad) - # # self._fstp_handler.reduce_scatter_handlers[key] = None - # del _grad - # del self._fstp_handler.all_reduce_handlers[key] - # self._fstp_handler.all_reduce_handlers[key] = None - # assert key in self._fstp_handler.all_reduce_handlers current_bucket.reset_by_rank(reduce_rank) @@ -634,7 +601,6 @@ def step(self, closure=None): # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients - print_memory("No 1") if not self._overlap_sync_grad: for group_id in range(len(self._fp16_param_groups)): for param in self._fp16_param_groups[group_id]: @@ -659,7 +625,6 @@ def step(self, closure=None): bucket.empty() self._bucket_in_progress = [] self._param_store.clear_grads_of_previous_reduced_params() - print_memory("No 2") # compute norm for gradients in the last bucket total_norms = {} for group_id in range(self.num_param_groups): @@ -681,19 +646,11 @@ def step(self, closure=None): scaled_norm_tensor = torch.tensor(scaled_norm, device=get_current_device(), dtype=torch.float) dist.all_reduce(scaled_norm_tensor, group=pg) total_norms[group_name] = scaled_norm_tensor.item() - print_memory("No 3") timer("sync_grad").start() self._sync_grad() timer("sync_grad").stop() - print_memory("No 4") - - try: - res = self._step(closure=closure, norms=total_norms) - except torch.cuda.OutOfMemoryError as e: - print(e, flush=True) - print(torch.cuda.memory_summary(), flush=True) - torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + res = self._step(closure=closure, norms=total_norms) return res @@ -740,7 +697,6 @@ def _step(self, closure=None, norms=None): self._grad_store._averaged_gradients = dict() self.zero_grad() return False, norms - 
print_memory("No 5") # copy the grad of fp16 param to fp32 param single_grad_partition_groups = [] for group_id in range(self.num_param_groups): @@ -781,7 +737,6 @@ def _step(self, closure=None, norms=None): single_grad_partition_groups.append(flat_fp32_avg_grads) device = self._fp32_flat_param_groups_of_current_rank[group_id].device self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device) - print_memory("No 6") # unscale and clip grads # get the global norm global_norm_groups = {} @@ -804,12 +759,9 @@ def _step(self, closure=None, norms=None): # For those ranks that are not assigned parameters, we just wait for other ranks # to send them updated their own parameters. if self.has_params: - print_memory("No 7") self.optim.step() - print_memory("No 8") # release the fp32 grad release_param_grad(self._fp32_flat_param_groups_of_current_rank.values()) - print_memory("No 9") # update fp16 partition updated by the current rank for group_id in range(len(self._fp16_param_groups)): if self.param_group_has_params[group_id]: @@ -818,7 +770,6 @@ def _step(self, closure=None, norms=None): ) fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id] fp16_param.data.copy_(fp32_param) - print_memory("No 10") torch.cuda.synchronize() with torch.cuda.stream(self._comm_bcast_stream): self.broadcast_params() @@ -829,7 +780,6 @@ def _step(self, closure=None, norms=None): # so synchronization is maintained for group_name, global_norm in global_norm_groups.items(): global_norm_groups[group_name] = global_norm / loss_scale - print_memory("No 11") return True, global_norm_groups def broadcast_params(self): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2816da0e..20592c26 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -38,7 +38,6 @@ from internlm.model.linear import ( CoarseGrainedFSTPAllGatherSyncHandler, FeedForward, - FSTPAllGatherSyncHandler, RewardModelLinear, ScaleColumnParallelLinear, ) @@ -111,7 +110,7 @@ def initialize_model(): gpc.config.fstp_handler = None - if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] == True: handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() diff --git a/train.py b/train.py index 41ab070d..a917d121 100644 --- a/train.py +++ b/train.py @@ -195,7 +195,7 @@ def main(args): # start iterating the train data and begin training for batch_count in range(train_state.batch_count, total_steps): empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval) - torch.cuda.memory._record_memory_history() + # torch.cuda.memory._record_memory_history() start_time = time.time() timer("one-batch").start() @@ -300,7 +300,7 @@ def main(args): if gpc.config.fstp_handler is not None: gpc.config.fstp_handler.zero_const_pool = {} gpc.config.fstp_handler.reduce_scatter_memory = {} - torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() From 85ad917ae430c2e89cf4444221c2ced9223d3552 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> 
Date: Fri, 20 Oct 2023 21:50:32 +0800 Subject: [PATCH 041/153] feat(model/overlap_handler.py): refactor overlap hook handle --- configs/7B_sft.py | 2 +- internlm/model/linear.py | 298 +++++------------- internlm/model/modeling_internlm.py | 11 +- internlm/model/multi_head_attention.py | 11 +- internlm/model/overlap_handler.py | 253 +++++++++++++++ internlm/model/utils.py | 98 +++--- .../solver/optimizer/hybrid_zero_optim.py | 12 +- internlm/train/training_internlm.py | 56 +--- train.py | 2 +- 9 files changed, 393 insertions(+), 350 deletions(-) create mode 100644 internlm/model/overlap_handler.py diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 09af7f45..c51c8129 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=True), + tensor=dict(size=8, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 2bbb9416..6cd3b9c8 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -1,22 +1,17 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Any, Optional, Union +from typing import Optional import torch -import torch.nn.functional as F from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear from flash_attn.utils.distributed import all_reduce, reduce_scatter from torch import nn from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.naive_amp import NaiveAMPModel -from internlm.model.embedding import Embedding1D from internlm.model.utils import ( Silu, - all_gather_raw, - all_gather_raw_memory_pool, fstp_fused_dense_func, fused_dense_func_torch, megatron_fused_dense_func_torch, @@ -25,20 +20,20 @@ class BaseScaleColumnParallelLinear(nn.Linear): """ - Base class for ScaleColumnParallelLinear. - - Args: - in_features (int): size of each input sample - out_features (int): size of each output sample - process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. - bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False - in the config. - sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: - we do an all_gather of x before doing the matmul. - If not, then the input is already gathered. - device (Optional[Union[str, torch.device]]): The device will be used. - dtype (Optional[torch.dtype]): The type of data. - weight_scale (int): For training stability. 1 by default. + Base class for ScaleColumnParallelLinear. + + Args: + in_features (int): size of each input sample + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. + sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul. + If not, then the input is already gathered. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + weight_scale (int): For training stability. 1 by default. 
""" def __init__( @@ -58,10 +53,12 @@ def __init__( self.process_group = process_group self.weight_scale = weight_scale + class ScaleColumnParallelLinear(BaseScaleColumnParallelLinear): """ ScaleColumnParallelLinear in flash implementation. """ + def forward(self, input, gather_dim=0): # pylint: disable=W0622 # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. @@ -79,6 +76,7 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 gather_dim=gather_dim, ) + class MegatronScaleColumnParallelLinear(BaseScaleColumnParallelLinear): """ ScaleColumnParallelLinear in megatron implementation. @@ -101,6 +99,7 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 gather_dim=gather_dim, ) + class RewardModelLinear(ScaleColumnParallelLinear): """ RewardModelLinear. @@ -164,6 +163,7 @@ def forward(self, x, gather_dim=0): gather_dim=gather_dim, ) + class MegatronColumnParallelLinearTorch(ColumnParallelLinear): def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: @@ -178,6 +178,7 @@ def forward(self, x, gather_dim=0): gather_dim=gather_dim, ) + class RowParallelLinearTorch(RowParallelLinear): def forward(self, x): """ @@ -188,6 +189,7 @@ def forward(self, x): reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce return reduce_fn(out, self.process_group) + class MegatronRowParallelLinearTorch(RowParallelLinear): def forward(self, x): """ @@ -225,8 +227,8 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - colum_cls = None, - row_cls = None, + colum_cls=None, + row_cls=None, ): super().__init__() hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) @@ -265,6 +267,7 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out + class FeedForward(BaseFeedForward): """ FeedForward in flash implementation. 
@@ -292,9 +295,19 @@ def __init__( dtype: Optional[torch.dtype] = None, multiple_of: int = 256, ): - super().__init__(in_features, hidden_features, out_features, process_group, bias, device, - dtype, multiple_of, ColumnParallelLinearTorch, RowParallelLinearTorch) - + super().__init__( + in_features, + hidden_features, + out_features, + process_group, + bias, + device, + dtype, + multiple_of, + ColumnParallelLinearTorch, + RowParallelLinearTorch, + ) + class MegatronFeedForward(BaseFeedForward): """ @@ -323,19 +336,35 @@ def __init__( dtype: Optional[torch.dtype] = None, multiple_of: int = 256, ): - super().__init__(in_features, hidden_features, out_features, process_group, bias, device, - dtype, multiple_of, MegatronColumnParallelLinearTorch, MegatronRowParallelLinearTorch) + super().__init__( + in_features, + hidden_features, + out_features, + process_group, + bias, + device, + dtype, + multiple_of, + MegatronColumnParallelLinearTorch, + MegatronRowParallelLinearTorch, + ) + class FSTPLinear(ColumnParallelLinear): def forward(self, x): block_index = gpc.config.fstp_handler.module_to_index[self] - name_index = gpc.config.fstp_handler.module_name_index[self] - name = gpc.config.fstp_handler.module_name[name_index] return fstp_fused_dense_func( - x, self.weight, self.bias, process_group=self.process_group, - module=self, handler=gpc.config.fstp_handler, block_index=block_index, module_name=name + x, + self.weight, + self.bias, + process_group=self.process_group, + module=self, + handler=gpc.config.fstp_handler, + block_index=block_index, + module_name=self._fstp_name, ) + class FSTPFeedForward(BaseFeedForward): """ FeedForward in FSTP. @@ -363,8 +392,19 @@ def __init__( dtype: Optional[torch.dtype] = None, multiple_of: int = 256, ): - super().__init__(in_features, hidden_features, out_features, process_group, bias, device, - dtype, multiple_of, FSTPLinear, FSTPLinear) + super().__init__( + in_features, + hidden_features, + out_features, + process_group, + bias, + device, + dtype, + multiple_of, + FSTPLinear, + FSTPLinear, + ) + def get_mlp_cls(sp_mode: str): if sp_mode in ["none", "flash-attn"]: @@ -375,6 +415,7 @@ def get_mlp_cls(sp_mode: str): mlp_cls = FSTPFeedForward return mlp_cls + def get_linear_cls(sp_mode: str, parallel_mode: str): if parallel_mode == "column": if sp_mode in ["none", "flash-attn"]: @@ -383,7 +424,7 @@ def get_linear_cls(sp_mode: str, parallel_mode: str): cls = MegatronColumnParallelLinearTorch else: cls = FSTPLinear - elif parallel_mode == 'row': + elif parallel_mode == "row": if sp_mode in ["none", "flash-attn"]: cls = RowParallelLinearTorch elif sp_mode == "megatron": @@ -391,192 +432,3 @@ def get_linear_cls(sp_mode: str, parallel_mode: str): else: cls = FSTPLinear return cls - -class CoarseGrainedFSTPAllGatherSyncHandler: - """ - All-gather handler for overlapping the all-gather in adjcent FSTP block. 
- """ - - def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - # import pdb; pdb.set_trace() - self.process_group = process_group - self.FSTP_blocks = [] - self.FSTP_outs = [] - self.FSTP_modules = [] - self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.FSTP_global_handle = dict() # key: FSTP module; value: module global all-gather op handle - self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward - self.block_handles = dict() # key: transformer block; value: all-gather handles - self.module_to_index = dict() # key: FSTP module; value: transformer block index - self.block_to_index = dict() # key: transformer block; value: transformer block index - self.index_to_block = dict() # key: transformer block index; value: transformer block - self.index_to_fsdp_modules = dict() # key: transformer block index; value: fsdp modules - self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name - self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - self.head = [] - self.embedding = [] - - self.reduce_scatter_handlers = {} - self.all_reduce_handlers = {} - self.zero_const_pool = {} - - # just want to share same for loop for ModuleList and Module - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _chunk_name, children in _chunk.named_children(): - if isinstance(children, nn.ModuleList): - for idx, block in enumerate(children): - index = 0 - self.block_module[idx] = {} - self.FSTP_blocks.append(block) - self.block_to_index[block] = idx - self.index_to_block[idx] = block - self.index_to_fsdp_modules[idx] = [] - for _sub_name, sub in block.named_children(): - sub_modules = list(sub.children()) - if len(sub_modules) > 0: - for name, child in sub.named_children(): - if name == "out_proj": - self.FSTP_outs.append(child) - self.module_to_index[child] = idx - if isinstance(child, FSTPLinear): - self.module_to_index[child] = idx - self.block_module[idx][index] = child - self.FSTP_modules.append(child) - self.index_to_fsdp_modules[idx].append(child) - self.module_name_index[child] = index - index = index + 1 - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - else: - continue - elif isinstance(children, ScaleColumnParallelLinear): - self.head.append(children) - elif isinstance(children, Embedding1D): - self.embedding.append(children) - - def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: - if size not in self.zero_const_pool: - self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() - - return self.zero_const_pool[size] - - def _all_gather_block_weight_memory_pool(self, block_index: int): - fsdp_modules = self.index_to_fsdp_modules[block_index] - for module in fsdp_modules: - module_index = self.module_name_index[module] - name = self.module_name[module_index] - weight_handle = all_gather_raw_memory_pool( - module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name - ) - self.FSTP_global_handle[module] = weight_handle - - def _register_sync_parameters_hook(self) -> None: - """ - register pre_forward_hook and pre_backward_hook for FSTP block. 
- - Notice that next block's all_gather op should be after current block's all_to_all op, so we - 1. register pre_forward_hook @out_proj module to prefetch for next block - 2. register pre_forward_hook @block module to wait handles for next block - 3. register pre_backward_hook @wqkv module to prefetch for next block - 4. register pre_backward_hook @block module to wait handles for next block - """ - - def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): - block_index = self.module_to_index[module] - # start the all-gather for next block - if block_index + 1 < gpc.config.NUM_LAYER: - self._all_gather_block_weight_memory_pool(block_index + 1) - - def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): - self._all_gather_block_weight_memory_pool(0) - - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): - handle = self.FSTP_global_handle[module] - handle.wait() - - def _post_forward_hook_for_module(module: nn.Module, input, output): - if module in self.FSTP_global_weights: - del self.FSTP_global_weights[module] - if module in self.FSTP_global_handle: - del self.FSTP_global_handle[module] - - def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): - first_module = self.block_module[gpc.config.NUM_LAYER - 1][4] - total_weight, weight_handler = all_gather_raw(first_module.weight, self.process_group, async_op=True) - self.FSTP_global_handle[first_module] = weight_handler - self.FSTP_global_weights[first_module] = total_weight - - def _pre_backward_hook_for_module_memory_pool(module: nn.Module, grad_output): - block_index = self.module_to_index[module] - name_index = self.module_name_index[module] - - if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - weight_handler = self.FSTP_global_handle[module] - weight_handler.wait() - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - next_name = self.module_name[name_index - 1] - weights_handler = all_gather_raw_memory_pool( - next_module.weight, - self.process_group, - async_op=True, - block_index=block_index, - module_name=next_name, - ) - self.FSTP_global_handle[next_module] = weights_handler - elif name_index == 0: - handler = self.FSTP_global_handle[module] - handler.wait() - - if block_index - 1 >= 0: - next_module = self.block_module[block_index - 1][4] - name = self.module_name[4] - weights_handler = all_gather_raw_memory_pool( - next_module.weight, - self.process_group, - async_op=True, - block_index=block_index - 1, - module_name=name, - ) - self.FSTP_global_handle[next_module] = weights_handler - else: - handler = self.FSTP_global_handle[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - name = self.module_name[name_index - 1] - weights_handler = all_gather_raw_memory_pool( - next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name - ) - self.FSTP_global_handle[next_module] = weights_handler - - def _post_backward_hook_for_module(module, grad_input, grad_output): - if module in self.FSTP_global_weights: - del self.FSTP_global_weights[module] - if module in self.FSTP_global_handle: - del self.FSTP_global_handle[module] - - for embedding in self.embedding: - embedding.register_forward_hook(_post_forward_hook_for_embedding) - - for head in self.head: - head.register_full_backward_hook(_post_backward_hook_for_head) - - for out_proj in self.FSTP_outs: - 
out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - - for module in self.FSTP_modules: - module.register_forward_pre_hook(_pre_forward_hook_for_module) - module.register_forward_hook(_post_forward_hook_for_module) - module.register_full_backward_pre_hook(_pre_backward_hook_for_module_memory_pool) - module.register_full_backward_hook(_post_backward_hook_for_module) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 3ed78d79..228e1e1c 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -14,12 +14,9 @@ from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - FeedForward, - MegatronFeedForward, - FSTPFeedForward, + MegatronScaleColumnParallelLinear, RewardModelLinear, ScaleColumnParallelLinear, - MegatronScaleColumnParallelLinear, get_mlp_cls, ) from internlm.model.multi_head_attention import MHA @@ -309,7 +306,11 @@ def __init__( if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear if self.sp_mode in ["flash-attn", "none", "intern"] else MegatronScaleColumnParallelLinear + head_cls = ( + ScaleColumnParallelLinear + if self.sp_mode in ["flash-attn", "none", "intern"] + else MegatronScaleColumnParallelLinear + ) if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 8ba49edd..93dbf010 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -38,14 +38,7 @@ from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding -from internlm.model.linear import ( - ColumnParallelLinearTorch, - FSTPLinear, - RowParallelLinearTorch, - MegatronColumnParallelLinearTorch, - MegatronRowParallelLinearTorch, - get_linear_cls, -) +from internlm.model.linear import get_linear_cls # adpated from https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py @@ -227,7 +220,7 @@ def __init__( self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group) # output projection always have the bias (for now) - out_proj_cls = get_linear_cls(sp_mode, 'row') + out_proj_cls = get_linear_cls(sp_mode, "row") self.out_proj = out_proj_cls( embed_dim, embed_dim, diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py new file mode 100644 index 00000000..cafb8183 --- /dev/null +++ b/internlm/model/overlap_handler.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +from typing import Any, Union + +import torch +from torch import nn + +from internlm.core.context import global_context as gpc +from internlm.core.naive_amp import NaiveAMPModel +from internlm.model.embedding import Embedding1D +from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear +from internlm.model.utils import all_gather_raw_memory_pool +from internlm.utils.common import get_current_device + + +class FSTPOverlapHandler: + """ + FSTP overlap handler for managing the all-gather and reduce_scatter overlapping. 
+ """ + + def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: + self.process_group = process_group + self.fstp_outs = [] + self.fstp_modules = [] + self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] + self.fstp_global_handle = dict() # key: fstp module; value: module global all-gather op handle + self.module_to_index = dict() # key: fstp module; value: transformer block index + self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules + self.head = [] + self.embedding = [] + + self.reduce_scatter_handlers = {} + self.zero_const_pool = {} + + # just want to share same for loop for ModuleList and Module + if not isinstance(model, nn.ModuleList): + model = [model] + + for _chunk in model: + if isinstance(_chunk, NaiveAMPModel): + _chunk = _chunk.model + + for _chunk_name, children in _chunk.named_children(): + if isinstance(children, ScaleColumnParallelLinear): + self.head.append(children) + elif isinstance(children, Embedding1D): + self.embedding.append(children) + elif isinstance(children, nn.ModuleList): + for idx, block in enumerate(children): + self.index_to_fstp_modules[idx] = [] + for _sub_name, sub in block.named_children(): + sub_modules = list(sub.children()) + if len(sub_modules) > 0: + for name, child in sub.named_children(): + if name == "out_proj": + self.fstp_outs.append(child) + self.module_to_index[child] = idx + if isinstance(child, FSTPLinear): + self.module_to_index[child] = idx + self.fstp_modules.append(child) + self.index_to_fstp_modules[idx].append(child) + + setattr(child, "_fstp_name", name) + + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + + self._initialize_memory_pool() + self._register_sync_parameters_hook() + + def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: + if size not in self.zero_const_pool: + self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() + + return self.zero_const_pool[size] + + def _initialize_memory_pool(self) -> None: + # allocate memory pool + hidden_size = gpc.config.HIDDEN_SIZE + mlp_ratio = gpc.config.MLP_RATIO + mlp_hidden_size = int(hidden_size * mlp_ratio) + mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) + self.all_gather_memory_pool = [] + self.reduce_scatter_memory_pool = {} + + for _ in range(2): + weight = {} + for name in self.module_name: + if name == "Wqkv": + weight[name] = torch.zeros( + (3 * hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + elif name == "out_proj": + weight[name] = torch.zeros( + (hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + elif name == "w1" or name == "w2": + weight[name] = torch.zeros( + (mlp_hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + else: + weight[name] = torch.zeros( + (hidden_size, mlp_hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + + self.all_gather_memory_pool.append(weight) # containing two groups of block weight + + def get_all_gather_memory(self, index, module_name): + return self.all_gather_memory_pool[index % 2][module_name] + + def get_reduce_scatter_memory(self, 
key): + return_idx = 0 + + # if key not in dict + if key not in self.reduce_scatter_memory_pool: + self.reduce_scatter_memory_pool[key] = {"data": [], "used": []} + + # if the data is empty + if len(self.reduce_scatter_memory_pool[key]["data"]) == 0: + self.reduce_scatter_memory_pool[key]["data"].append( + torch.zeros( + key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() + ).contiguous() + ) + self.reduce_scatter_memory_pool[key]["used"].append(True) + return_idx = 0 + return return_idx + else: # if not empty + for index, used in enumerate(self.reduce_scatter_memory_pool[key]["used"]): + if used is False: + self.reduce_scatter_memory_pool[key]["used"][index] = True + return_idx = index + return return_idx + # if the memory pool is all used + length = len(self.reduce_scatter_memory_pool[key]["data"]) + self.reduce_scatter_memory_pool[key]["data"].append( + torch.zeros( + key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() + ).contiguous() + ) + self.reduce_scatter_memory_pool[key]["used"].append(True) + return_idx = length + return return_idx + + def release_reduce_scatter_memory(self, size, index): + self.reduce_scatter_memory_pool[size]["used"][index] = False + + def _all_gather_block_weight_memory_pool(self, block_index: int): + fstp_modules = self.index_to_fstp_modules[block_index] + for module in fstp_modules: + weight_handle = all_gather_raw_memory_pool( + module.weight, + self.process_group, + async_op=True, + block_index=block_index, + module_name=getattr(module, "_fstp_name"), + ) + self.fstp_global_handle[module] = weight_handle + + def _register_sync_parameters_hook(self) -> None: + """ + register forward hooks and backward hooks for fstp modules. + """ + + def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): + self._all_gather_block_weight_memory_pool(0) + + def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): + block_index = self.module_to_index[module] + # start the all-gather for next block + if block_index + 1 < gpc.config.NUM_LAYER: + self._all_gather_block_weight_memory_pool(block_index + 1) + + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + handle = self.fstp_global_handle[module] + handle.wait() + + def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): + if module in self.fstp_global_handle: + del self.fstp_global_handle[module] + + def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): + first_backward_module = self.fstp_modules[-1] + block_index = self.module_to_index[first_backward_module] + weight_handle = all_gather_raw_memory_pool( + first_backward_module.weight, + self.process_group, + async_op=True, + block_index=block_index, + module_name=getattr(first_backward_module, "_fstp_name"), + ) + self.fstp_global_handle[first_backward_module] = weight_handle + + def _pre_backward_hook_for_module(module: nn.Module, grad_output): + # wait handle for current module + weight_handle = self.fstp_global_handle[module] + weight_handle.wait() + + # start the all-gather for next module + module_index = self.fstp_modules.index(module) + if module_index - 1 >= 0: + next_module = self.fstp_modules[module_index - 1] + block_index = self.module_to_index[next_module] + weight_handle = all_gather_raw_memory_pool( + next_module.weight, + self.process_group, + async_op=True, + block_index=block_index, + module_name=getattr(next_module, "_fstp_name"), + ) + self.fstp_global_handle[next_module] = weight_handle + + 
def _post_backward_hook_for_module(module, grad_input, grad_output):
+            if module in self.fstp_global_handle:
+                del self.fstp_global_handle[module]
+
+        # register forward hooks
+        # 1. register post_forward_hook @embedding module to prefetch for block 0
+        # 2. register pre_forward_hook @out_proj module to prefetch for next block,
+        # notice that next block's all_gather op should be after current block's all_to_all op
+        # 3. register pre_forward_hook @fstp_module to wait handle for current module
+        # 4. register post_forward_hook @fstp_module to release resource
+        for embedding in self.embedding:
+            embedding.register_forward_hook(_post_forward_hook_for_embedding)
+
+        for out_proj in self.fstp_outs:
+            out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj)
+
+        for module in self.fstp_modules:
+            module.register_forward_pre_hook(_pre_forward_hook_for_module)
+            module.register_forward_hook(_post_forward_hook_for_module)
+
+        # register backward hooks
+        # 1. register post_backward_hook @head module to prefetch for the last block's last module
+        # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module
+        # 3. register post_backward_hook @fstp_module to release resource
+        for head in self.head:
+            head.register_full_backward_hook(_post_backward_hook_for_head)
+
+        for module in self.fstp_modules:
+            module.register_full_backward_pre_hook(_pre_backward_hook_for_module)
+            module.register_full_backward_hook(_post_backward_hook_for_module)
diff --git a/internlm/model/utils.py b/internlm/model/utils.py
index b1894e9f..ccdca481 100644
--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@@ -135,7 +135,7 @@ def all_gather_raw_memory_pool(
     module_name: str = None,
 ):
     handle = torch.distributed.all_gather_into_tensor(
-        gpc.config.block_memory[block_index % 2][module_name],
+        gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name),
         input_.contiguous(),
         group=process_group,
         async_op=async_op,
@@ -166,8 +166,8 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup,
     world_size = torch.distributed.get_world_size(process_group)
     assert input_.shape[0] % world_size == 0
     size = (input_.shape[0] // world_size, *input_.shape[1:])
-    index = check_reduce_scatter_memory_pool(size)
-    output = gpc.config.reduce_scatter_memory[size]["data"][index]
+    index = gpc.config.fstp_handler.get_reduce_scatter_memory(size)
+    output = gpc.config.fstp_handler.reduce_scatter_memory_pool[size]["data"][index]
     setattr(output, "index", index)
     handle = torch.distributed.reduce_scatter_tensor(
         output, input_.contiguous(), group=process_group, async_op=async_op
@@ -269,11 +269,11 @@
 class MegatronFusedDenseFunc(torch.autograd.Function):
-    '''
+    """
     FusedDenseFunc for tensor parallel in megatron implementation.
     The difference between the implementation of flash-attn and megatron is that the total_x could be saved for backward in megatron,
     so that the all-gather in backward is omitted.
- ''' + """ @staticmethod @custom_fwd @@ -355,9 +355,10 @@ def backward(ctx, grad_output, *args): handle_grad_input.wait() return grad_input, grad_weight, grad_bias, None, None, None, None + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFuncTorch(FusedDenseFunc): - '''FusedDenseFunc in flash implementation for supporting torch.float32''' + """FusedDenseFunc in flash implementation for supporting torch.float32""" @staticmethod @custom_bwd @@ -407,8 +408,9 @@ def backward(ctx, grad_output, *args): handle_grad_input.wait() return grad_input, grad_weight, grad_bias, None, None, None, None + class MegatronFusedDenseFuncTorch(FusedDenseFunc): - '''FusedDenseFunc in megatron implementation for supporting torch.float32''' + """FusedDenseFunc in megatron implementation for supporting torch.float32""" @staticmethod @custom_bwd @@ -452,6 +454,7 @@ def backward(ctx, grad_output, *args): handle_grad_input.wait() return grad_input, grad_weight, grad_bias, None, None, None, None + class FSTPFusedDenseFunc(torch.autograd.Function): "FusedDenseFunc for FSTP, which is optimized based on flash implementation." @@ -485,7 +488,7 @@ def forward( if world_size > 1: # do all_gather for weight and bias before actual computation if overlap_handler is not None: - total_weight = gpc.config.block_memory[block_index % 2][module_name] + total_weight = gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -544,7 +547,7 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: if overlap_handler is not None: - total_weight = gpc.config.block_memory[block_index % 2][module_name] + total_weight = gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -559,17 +562,39 @@ def backward(ctx, grad_output, *args): ) if world_size > 1: if overlap_handler is not None: - grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(grad_weight, process_group, async_op=True) + grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( + grad_weight, process_group, async_op=True + ) assert hasattr(weight, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = overlap_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( + handle_grad_weight, + grad_weight_async, + ) + grad_weight = overlap_handler.get_zero_by_shape( + ( + grad_weight.shape[0] // torch.distributed.get_world_size(process_group), + *grad_weight.shape[1:], + ), + dtype=grad_weight.dtype, + device=grad_weight.device, + ) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( grad_bias, process_group, async_op=True ) assert hasattr(bias, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = overlap_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, 
device=grad_bias.device) + overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( + handle_grad_bias, + grad_bias_async, + ) + grad_bias = overlap_handler.get_zero_by_shape( + ( + grad_bias.shape[0] // torch.distributed.get_world_size(process_group), + *grad_bias.shape[1:], + ), + dtype=grad_bias.dtype, + device=grad_bias.device, + ) else: grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: @@ -613,6 +638,7 @@ def fused_dense_func_torch( else: return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) + def megatron_fused_dense_func_torch( x: Tensor, weight: Tensor, @@ -626,9 +652,14 @@ def megatron_fused_dense_func_torch( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return MegatronFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) + return MegatronFusedDenseFunc.apply( + x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim + ) else: - return MegatronFusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) + return MegatronFusedDenseFuncTorch.apply( + x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim + ) + def fstp_fused_dense_func( x: Tensor, @@ -693,38 +724,3 @@ def Silu(w1_o, w2_o): Silu = torch.jit.script(Silu) - - -def check_reduce_scatter_memory_pool(key): - return_idx = 0 - - # if key not in dict - if key not in gpc.config.reduce_scatter_memory: - gpc.config.reduce_scatter_memory[key] = {"data": [], "used": []} - - # if the data is empty - if len(gpc.config.reduce_scatter_memory[key]["data"]) == 0: - gpc.config.reduce_scatter_memory[key]["data"].append( - torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() - ) - gpc.config.reduce_scatter_memory[key]["used"].append(True) - return_idx = 0 - return return_idx - else: # if not empty - for index, used in enumerate(gpc.config.reduce_scatter_memory[key]["used"]): - if used is False: - gpc.config.reduce_scatter_memory[key]["used"][index] = True - return_idx = index - return return_idx - # if the memory pool is all used - length = len(gpc.config.reduce_scatter_memory[key]["data"]) - gpc.config.reduce_scatter_memory[key]["data"].append( - torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() - ) - gpc.config.reduce_scatter_memory[key]["used"].append(True) - return_idx = length - return return_idx - - -def release_reduce_scatter_memory_pool(size, index): - gpc.config.reduce_scatter_memory[size]["used"][index] = False diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 0f536ec5..e2ec7efd 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -11,7 +11,6 @@ from internlm.core.context import Config, ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import release_reduce_scatter_memory_pool from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, @@ -41,6 +40,7 @@ inf = math.inf logger = get_logger(__file__) + class HybridZeroOptimizer(BaseOptimizer): """ Hybrid Zero Optimizer. 
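For reference, the reduce-scatter memory pool that FSTPOverlapHandler now owns is a per-shape free list: get_reduce_scatter_memory returns the index of a free buffer (growing the pool when every buffer of that shape is busy), and the optimizer below releases the buffer once the gradient has been accumulated. A stripped-down sketch of the same protocol, with dtype/device as illustrative defaults:

import torch


class ReduceScatterPool:
    """Per-shape free list of reusable output buffers."""

    def __init__(self, dtype=torch.float16, device="cpu"):
        self.dtype, self.device = dtype, device
        self.pool = {}  # shape -> {"data": [tensors], "used": [flags]}

    def acquire(self, size: tuple) -> int:
        entry = self.pool.setdefault(size, {"data": [], "used": []})
        for idx, used in enumerate(entry["used"]):
            if not used:
                # Reuse the first free buffer of this shape.
                entry["used"][idx] = True
                return idx
        # First call, or all buffers of this shape are in flight: grow by one.
        entry["data"].append(torch.zeros(size, dtype=self.dtype, device=self.device))
        entry["used"].append(True)
        return len(entry["data"]) - 1

    def release(self, size: tuple, index: int) -> None:
        # Mark the buffer free for the next reduce-scatter of the same shape.
        self.pool[size]["used"][index] = False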
@@ -65,7 +65,7 @@ def __init__( backoff_factor = grad_scal_cfg.backoff_factor hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - + self._fstp_handler = None if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: self._fstp_handler = gpc.config.fstp_handler @@ -350,7 +350,7 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - release_reduce_scatter_memory_pool(size=tuple(_grad.size()), index=_grad.index) + gpc.config.fstp_handler.release_reduce_scatter_memory(size=tuple(_grad.size()), index=_grad.index) self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) @@ -635,9 +635,9 @@ def step(self, closure=None): timer("sync_grad").start() self._sync_grad() timer("sync_grad").stop() - - res = self._step(closure=closure, norms=total_norms) - + + res = self._step(closure=closure, norms=total_norms) + return res def _step(self, closure=None, norms=None): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 53996b38..cabb7ebd 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -36,12 +36,12 @@ from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - CoarseGrainedFSTPAllGatherSyncHandler, FeedForward, RewardModelLinear, ScaleColumnParallelLinear, ) from internlm.model.multi_head_attention import MHA +from internlm.model.overlap_handler import FSTPOverlapHandler from internlm.model.utils import try_import_RMSNorm from internlm.monitor import send_heartbeat, set_env_var from internlm.monitor.monitor import monitor_manager as mm @@ -109,60 +109,8 @@ def initialize_model(): model = wrap_FSDP_model(model) gpc.config.fstp_handler = None - if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) - handler._register_sync_parameters_hook() - gpc.config.fstp_handler = handler - - # allocate memory pool - block_memory = {} # containing two groups of block weight - hidden_size = gpc.config.HIDDEN_SIZE - mlp_ratio = gpc.config.MLP_RATIO - mlp_hidden_size = int(hidden_size * mlp_ratio) - mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) - world_size = gpc.get_world_size(ParallelMode.TENSOR) - size_key = [ - (3 * hidden_size // world_size, hidden_size), - (mlp_hidden_size // world_size, hidden_size), - (hidden_size // world_size, mlp_hidden_size), - (hidden_size // world_size, hidden_size), - ] - module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - for i in range(2): - weight = {} - for name in module_name: - if name == "Wqkv": - weight[name] = torch.zeros( - (3 * hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - elif name == "out_proj": - weight[name] = torch.zeros( - (hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - elif name == "w1" or name == "w2": - weight[name] = torch.zeros( - (mlp_hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - else: - weight[name] = torch.zeros( - (hidden_size, mlp_hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - 
device=get_current_device(), - ).contiguous() - block_memory[i] = weight - reduce_scatter_memory = {} - for key in size_key: - reduce_scatter_memory[key] = {"data": [], "used": []} - - gpc.config.block_memory = block_memory - gpc.config.reduce_scatter_memory = reduce_scatter_memory + gpc.config.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.TENSOR)) return model diff --git a/train.py b/train.py index 02f28028..5066960e 100644 --- a/train.py +++ b/train.py @@ -299,7 +299,7 @@ def main(args): if gpc.config.fstp_handler is not None: gpc.config.fstp_handler.zero_const_pool = {} - gpc.config.fstp_handler.reduce_scatter_memory = {} + gpc.config.fstp_handler.reduce_scatter_memory_pool = {} # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() From b20f47a1fe5fb446f2d9df5a83b31cb6033579f0 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 12:02:32 +0800 Subject: [PATCH 042/153] feat(model/overlap_handler.py): move handler to gpc --- internlm/model/linear.py | 5 +--- internlm/model/overlap_handler.py | 16 ++++------ internlm/model/utils.py | 29 ++++++------------- .../solver/optimizer/hybrid_zero_optim.py | 4 +-- internlm/train/training_internlm.py | 4 +-- train.py | 6 ++-- 6 files changed, 23 insertions(+), 41 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 6cd3b9c8..b92b2ee5 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -352,16 +352,13 @@ def __init__( class FSTPLinear(ColumnParallelLinear): def forward(self, x): - block_index = gpc.config.fstp_handler.module_to_index[self] return fstp_fused_dense_func( x, self.weight, self.bias, process_group=self.process_group, module=self, - handler=gpc.config.fstp_handler, - block_index=block_index, - module_name=self._fstp_name, + handler=gpc.fstp_handler, ) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index cafb8183..b6877234 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -116,8 +116,9 @@ def _initialize_memory_pool(self) -> None: self.all_gather_memory_pool.append(weight) # containing two groups of block weight - def get_all_gather_memory(self, index, module_name): - return self.all_gather_memory_pool[index % 2][module_name] + def get_all_gather_memory(self, module): + block_index = self.module_to_index[module] + return self.all_gather_memory_pool[block_index % 2][module._fstp_name] def get_reduce_scatter_memory(self, key): return_idx = 0 @@ -163,8 +164,7 @@ def _all_gather_block_weight_memory_pool(self, block_index: int): module.weight, self.process_group, async_op=True, - block_index=block_index, - module_name=getattr(module, "_fstp_name"), + module=module, ) self.fstp_global_handle[module] = weight_handle @@ -192,13 +192,11 @@ def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): first_backward_module = self.fstp_modules[-1] - block_index = self.module_to_index[first_backward_module] weight_handle = all_gather_raw_memory_pool( first_backward_module.weight, self.process_group, async_op=True, - block_index=block_index, - module_name=getattr(first_backward_module, "_fstp_name"), + module=first_backward_module, ) self.fstp_global_handle[first_backward_module] = weight_handle @@ -211,13 +209,11 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): module_index = 
self.fstp_modules.index(module) if module_index - 1 >= 0: next_module = self.fstp_modules[module_index - 1] - block_index = self.module_to_index[next_module] weight_handle = all_gather_raw_memory_pool( next_module.weight, self.process_group, async_op=True, - block_index=block_index, - module_name=getattr(next_module, "_fstp_name"), + module=next_module, ) self.fstp_global_handle[next_module] = weight_handle diff --git a/internlm/model/utils.py b/internlm/model/utils.py index ccdca481..cdbed954 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -7,13 +7,12 @@ import torch import torch.nn.functional as F from flash_attn.utils.distributed import all_reduce_raw -from torch import Tensor +from torch import Tensor, nn from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.utils.common import get_current_device from internlm.utils.logger import get_logger logger = get_logger(__file__) @@ -131,11 +130,10 @@ def all_gather_raw_memory_pool( process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0, - block_index: int = None, - module_name: str = None, + module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( - gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name), + gpc.fstp_handler.get_all_gather_memory(module=module), input_.contiguous(), group=process_group, async_op=async_op, @@ -166,8 +164,8 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 size = (input_.shape[0] // world_size, *input_.shape[1:]) - index = gpc.config.fstp_handler.get_reduce_scatter_memory(size) - output = gpc.config.fstp_handler.reduce_scatter_memory_pool[size]["data"][index] + index = gpc.fstp_handler.get_reduce_scatter_memory(size) + output = gpc.fstp_handler.reduce_scatter_memory_pool[size]["data"][index] setattr(output, "index", index) handle = torch.distributed.reduce_scatter_tensor( output, input_.contiguous(), group=process_group, async_op=async_op @@ -469,16 +467,12 @@ def forward( process_group=None, module=None, overlap_handler=None, - block_index=None, - module_name=None, ): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group ctx.overlap_handler = overlap_handler ctx.module = module - ctx.block_index = block_index - ctx.module_name = module_name if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -488,7 +482,7 @@ def forward( if world_size > 1: # do all_gather for weight and bias before actual computation if overlap_handler is not None: - total_weight = gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name) + total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -531,8 +525,7 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.contiguous() process_group = ctx.process_group overlap_handler = ctx.overlap_handler - block_index = ctx.block_index - module_name = ctx.module_name + module = ctx.module if ctx.compute_weight_gradient: x, weight, bias = ctx.saved_tensors @@ -547,7 +540,7 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: if 
overlap_handler is not None: - total_weight = gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name) + total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -669,16 +662,12 @@ def fstp_fused_dense_func( process_group=None, module=None, handler=None, - block_index=None, - module_name=None, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSTPFusedDenseFunc.apply( - x, weight, bias, return_residual, process_group, module, handler, block_index, module_name - ) + return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler) else: assert process_group is None out = F.linear(x, weight, bias) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index e2ec7efd..08d97229 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -68,7 +68,7 @@ def __init__( self._fstp_handler = None if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - self._fstp_handler = gpc.config.fstp_handler + self._fstp_handler = gpc.fstp_handler # Zero related args reduce_bucket_size = zero_cfg.reduce_bucket_size @@ -350,7 +350,7 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - gpc.config.fstp_handler.release_reduce_scatter_memory(size=tuple(_grad.size()), index=_grad.index) + gpc.fstp_handler.release_reduce_scatter_memory(size=tuple(_grad.size()), index=_grad.index) self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index cabb7ebd..b05611bc 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -108,9 +108,9 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - gpc.config.fstp_handler = None + gpc.fstp_handler = None if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - gpc.config.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.TENSOR)) + gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.TENSOR)) return model diff --git a/train.py b/train.py index 5066960e..96dc24d1 100644 --- a/train.py +++ b/train.py @@ -297,9 +297,9 @@ def main(args): prof.step() - if gpc.config.fstp_handler is not None: - gpc.config.fstp_handler.zero_const_pool = {} - gpc.config.fstp_handler.reduce_scatter_memory_pool = {} + if gpc.fstp_handler is not None: + gpc.fstp_handler.zero_const_pool = {} + gpc.fstp_handler.reduce_scatter_memory_pool = {} # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats()
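With this patch the overlap handler hangs off gpc directly, and every memory-pool lookup is keyed by the module object itself instead of a (block_index, module_name) pair threaded through the autograd function. A minimal runnable sketch of the double-buffering rule behind get_all_gather_memory, with toy stand-ins rather than the project's classes:

class ToyModule:
    def __init__(self, name):
        self._fstp_name = name  # mirrors the handler's per-module name tag

# Two pre-allocated buffer groups; blocks alternate between them, so
# prefetching weights for block i+1 never clobbers buffers block i still reads.
pool = [{"Wqkv": "buffer_A"}, {"Wqkv": "buffer_B"}]
modules = [ToyModule("Wqkv") for _ in range(4)]
module_to_index = {m: i for i, m in enumerate(modules)}  # module -> block index

def get_all_gather_memory(module):
    return pool[module_to_index[module] % 2][module._fstp_name]

assert get_all_gather_memory(modules[0]) != get_all_gather_memory(modules[1])
assert get_all_gather_memory(modules[0]) == get_all_gather_memory(modules[2])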
From e7f9f1d20853e856f175d178bf94350871744b67 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 13:31:23 +0800 Subject: [PATCH 043/153] feat(model/overlap_handler.py): optimize reduce scatter mem pool --- internlm/model/overlap_handler.py | 35 ++++++++++--------- internlm/model/utils.py | 4 +-- .../solver/optimizer/hybrid_zero_optim.py | 2 +- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index b6877234..b3c8b8b0 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -125,37 +125,38 @@ def get_reduce_scatter_memory(self, key): # if key not in dict if key not in self.reduce_scatter_memory_pool: - self.reduce_scatter_memory_pool[key] = {"data": [], "used": []} + self.reduce_scatter_memory_pool[key] = [] # if the data is empty - if len(self.reduce_scatter_memory_pool[key]["data"]) == 0: - self.reduce_scatter_memory_pool[key]["data"].append( + if len(self.reduce_scatter_memory_pool[key]) == 0: + self.reduce_scatter_memory_pool[key].append( torch.zeros( key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() ).contiguous() ) - self.reduce_scatter_memory_pool[key]["used"].append(True) - return_idx = 0 - return return_idx + setattr(self.reduce_scatter_memory_pool[key][return_idx], "idle", False) + setattr(self.reduce_scatter_memory_pool[key][return_idx], "index", return_idx) + return self.reduce_scatter_memory_pool[key][return_idx] else: # if not empty - for index, used in enumerate(self.reduce_scatter_memory_pool[key]["used"]): - if used is False: - self.reduce_scatter_memory_pool[key]["used"][index] = True + for index, mem_item in enumerate(self.reduce_scatter_memory_pool[key]): + if mem_item.idle is True: + self.reduce_scatter_memory_pool[key][index].idle = False return_idx = index - return return_idx + return self.reduce_scatter_memory_pool[key][return_idx] # if the memory pool is all used - length = len(self.reduce_scatter_memory_pool[key]["data"]) - self.reduce_scatter_memory_pool[key]["data"].append( + cur_len = len(self.reduce_scatter_memory_pool[key]) + self.reduce_scatter_memory_pool[key].append( torch.zeros( key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() ).contiguous() ) - self.reduce_scatter_memory_pool[key]["used"].append(True) - return_idx = length - return return_idx + setattr(self.reduce_scatter_memory_pool[key][cur_len], "idle", False) + return_idx = cur_len + setattr(self.reduce_scatter_memory_pool[key][return_idx], "index", return_idx) + return self.reduce_scatter_memory_pool[key][return_idx] - def release_reduce_scatter_memory(self, size, index): - self.reduce_scatter_memory_pool[size]["used"][index] = False + def release_reduce_scatter_memory(self, key, index): + self.reduce_scatter_memory_pool[key][index].idle = True def _all_gather_block_weight_memory_pool(self, block_index: int): fstp_modules = self.index_to_fstp_modules[block_index] diff --git a/internlm/model/utils.py b/internlm/model/utils.py index cdbed954..8070cbdc 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -164,9 +164,7 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 size = (input_.shape[0] // world_size, *input_.shape[1:]) - index = gpc.fstp_handler.get_reduce_scatter_memory(size) - output = gpc.fstp_handler.reduce_scatter_memory_pool[size]["data"][index] - setattr(output, "index", index) + output = gpc.fstp_handler.get_reduce_scatter_memory(size) handle = torch.distributed.reduce_scatter_tensor( output, input_.contiguous(), group=process_group, async_op=async_op ) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 08d97229..0d0c8a3b 100644 ---
a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -350,7 +350,7 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - gpc.fstp_handler.release_reduce_scatter_memory(size=tuple(_grad.size()), index=_grad.index) + gpc.fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank)
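The rewritten pool above replaces the parallel "data"/"used" lists with a flat list of buffers that carry their own idle flag and index, which is what lets reduce_scatter_raw_memory_pool hand the buffer straight to the collective and lets the optimizer release it later by (key, index). A minimal sketch of that get/release protocol, with plain objects standing in for CUDA tensors (illustrative, not the project's API):

class Buf:
    def __init__(self, index):
        self.idle = False   # handed out as busy, exactly like the real pool
        self.index = index

pool = {}

def get_reduce_scatter_memory(key):
    items = pool.setdefault(key, [])
    for item in items:              # reuse the first idle buffer of this shape
        if item.idle:
            item.idle = False
            return item
    items.append(Buf(len(items)))   # all busy: grow the pool by one buffer
    return items[-1]

def release_reduce_scatter_memory(key, index):
    pool[key][index].idle = True    # reusable once the gradient is consumed

a = get_reduce_scatter_memory((4, 2))
b = get_reduce_scatter_memory((4, 2))
release_reduce_scatter_memory((4, 2), a.index)
assert get_reduce_scatter_memory((4, 2)) is a   # recycled, pool stays at two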
From f6a5086fe4203727ed96ce4444493a080d91b74d Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 23 Oct 2023 14:51:27 +0800 Subject: [PATCH 044/153] support bias --- internlm/model/overlap_handler.py | 85 ++++++++++++++++++++----------- internlm/model/utils.py | 22 +++++++- 2 files changed, 75 insertions(+), 32 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index b3c8b8b0..f7132c3b 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -10,7 +10,7 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.model.embedding import Embedding1D from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear -from internlm.model.utils import all_gather_raw_memory_pool +from internlm.model.utils import all_gather_raw_memory_pool, all_gather_raw_bias_memory_pool from internlm.utils.common import get_current_device @@ -25,6 +25,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.fstp_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] self.fstp_global_handle = dict() # key: fstp module; value: module global all-gather op handle + self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle self.module_to_index = dict() # key: fstp module; value: transformer block index self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules self.head = [] @@ -76,49 +77,61 @@ def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() return self.zero_const_pool[size] - - def _initialize_memory_pool(self) -> None: - # allocate memory pool + + def _initialize_module_shape(self): hidden_size = gpc.config.HIDDEN_SIZE mlp_ratio = gpc.config.MLP_RATIO mlp_hidden_size = int(hidden_size * mlp_ratio) mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) + + self.module_shape["Wqkv"] = (3 * hidden_size, hidden_size) + self.module_shape["out_proj"] = (hidden_size, hidden_size) + self.module_shape["w1"] = (mlp_hidden_size, hidden_size) + self.module_shape["w2"] = (mlp_hidden_size, hidden_size) + self.module_shape["w3"] = (hidden_size, mlp_hidden_size) + + def _initialize_memory_pool(self) -> None: + # allocate memory pool self.all_gather_memory_pool = [] + self.all_gather_bias_memory_pool = [] self.reduce_scatter_memory_pool = {} + self.module_shape = {} + + self._initialize_module_shape() + dtype = gpc.config.model.get("dtype", torch.half) + device = get_current_device() for _ in range(2): weight = {} for name in self.module_name: - if name == "Wqkv": - weight[name] = torch.zeros( - (3 * hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - elif name == "out_proj": - weight[name] = torch.zeros( - (hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - elif name == "w1" or name == "w2": - weight[name] = torch.zeros( - (mlp_hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - else: - weight[name] = torch.zeros( - (hidden_size, mlp_hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - + weight[name] = torch.zeros(self.module_shape[name], dtype=dtype, device=device).contiguous() self.all_gather_memory_pool.append(weight) # containing two groups of block weight def get_all_gather_memory(self, module): block_index = self.module_to_index[module] return self.all_gather_memory_pool[block_index % 2][module._fstp_name] + + def get_bias_memory(self, module: nn.Module): + block_index = self.module_to_index[module] + # if the bias memory pool is empty or module has not been allocated memory + # import pdb; pdb.set_trace() + if len(self.all_gather_bias_memory_pool) == 0: + for _ in range(2): + weight = {} + weight[module._fstp_name] = torch.zeros( + self.module_shape[module._fstp_name][0], + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device()).contiguous() + self.all_gather_bias_memory_pool.append(weight) + elif module._fstp_name not in self.all_gather_bias_memory_pool[0]: + for i in range(2): + self.all_gather_bias_memory_pool[i][module._fstp_name] = torch.zeros( + self.module_shape[module._fstp_name][0], + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device()).contiguous() + + return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] + def get_reduce_scatter_memory(self, key): return_idx = 0 @@ -157,10 +170,19 @@ def get_reduce_scatter_memory(self, key): def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True - + def _all_gather_block_weight_memory_pool(self, block_index: int): fstp_modules = self.index_to_fstp_modules[block_index] for module in fstp_modules: + if module.bias is not None: + bias_handle = all_gather_raw_bias_memory_pool( + module.bias, + self.process_group, + async_op=True, + module=module, + ) + self.bias_global_handle[module] = bias_handle + weight_handle = all_gather_raw_memory_pool( module.weight, self.process_group, @@ -186,6 +208,9 @@ def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): handle = self.fstp_global_handle[module] handle.wait() + if module.bias is not None: + bias_handle = self.bias_global_handle[module] + bias_handle.wait() def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): if module in self.fstp_global_handle: diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 8070cbdc..8a1281e8 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -140,6 +140,21 @@ def all_gather_raw_memory_pool( ) return handle +def all_gather_raw_bias_memory_pool( + input_: Tensor, + process_group: ProcessGroup, + async_op: bool = False, + gather_dim: int = 0, + module: nn.Module = None, +): + handle = torch.distributed.all_gather_into_tensor( + gpc.fstp_handler.get_bias_memory(module=module), + input_.contiguous(), + group=process_group, + async_op=async_op, + ) + return handle + def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): assert my_input.dtype == grad_output.dtype @@ -486,8 +501,11 @@ def forward( handle_weight.wait() # TODO memory pool for bias if bias is not None: - total_bias,
handle_bias = all_gather_raw(bias, process_group, async_op=True) - handle_bias.wait() + if overlap_handler is not None: + total_bias = gpc.fstp_handler.get_bias_memory(module=module) + else: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() else: total_bias = bias else:
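Bias tensors are small and not every FSTP linear has one, so unlike the weight pool the bias buffers are allocated lazily on first request, still in two groups so the same block-parity rule applies. A minimal sketch of get_bias_memory's lazy scheme, using plain Python stand-ins for the hypothetical buffers:

bias_pool = []   # lazily becomes two dicts: module name -> buffer

def get_bias_memory(name, block_index):
    if not bias_pool:
        bias_pool.extend([{}, {}])        # create both groups on first use
    if name not in bias_pool[0]:
        for group in bias_pool:           # register the name in both groups
            group[name] = bytearray(8)    # stand-in for a torch.zeros buffer
    return bias_pool[block_index % 2][name]

b0 = get_bias_memory("Wqkv", 0)
b2 = get_bias_memory("Wqkv", 2)
assert b0 is b2   # blocks with the same parity share one bias buffer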
From 0d693cf3a182b34cc9af7b6ef640f250ff7abbda Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 15:22:03 +0800 Subject: [PATCH 045/153] feat(model/overlap_handler.py): fix lint error --- internlm/model/moe.py | 1 - internlm/model/overlap_handler.py | 40 ++++++++++++++++------- internlm/model/utils.py | 1 + train.py | 3 +-- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/internlm/model/moe.py b/internlm/model/moe.py index 28e5ae6e..0865097f 100644 --- a/internlm/model/moe.py +++ b/internlm/model/moe.py @@ -53,7 +53,6 @@ def __init__( device=None, dtype=None, ): - super().__init__() assert ( diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index f7132c3b..3f7ee055 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -10,7 +10,10 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.model.embedding import Embedding1D from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear -from internlm.model.utils import all_gather_raw_memory_pool, all_gather_raw_bias_memory_pool +from internlm.model.utils import ( + all_gather_raw_bias_memory_pool, + all_gather_raw_memory_pool, +) from internlm.utils.common import get_current_device @@ -25,7 +28,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.fstp_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] self.fstp_global_handle = dict() # key: fstp module; value: module global all-gather op handle - self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle + self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle self.module_to_index = dict() # key: fstp module; value: transformer block index self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules self.head = [] @@ -77,13 +80,13 @@ def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() return self.zero_const_pool[size] - + def _initialize_module_shape(self): hidden_size = gpc.config.HIDDEN_SIZE mlp_ratio = gpc.config.MLP_RATIO mlp_hidden_size = int(hidden_size * mlp_ratio) mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) - + self.module_shape["Wqkv"] = (3 * hidden_size, hidden_size) self.module_shape["out_proj"] = (hidden_size, hidden_size) self.module_shape["w1"] = (mlp_hidden_size, hidden_size) @@ -96,7 +99,7 @@ def _initialize_memory_pool(self) -> None: self.all_gather_bias_memory_pool = [] self.reduce_scatter_memory_pool = {} self.module_shape = {} - + self._initialize_module_shape() dtype = gpc.config.model.get("dtype", torch.half) device = get_current_device() @@ -107,10 +110,14 @@ def _initialize_memory_pool(self) -> None: weight[name] = torch.zeros(self.module_shape[name], dtype=dtype, device=device).contiguous() self.all_gather_memory_pool.append(weight) # containing two groups of block weight + def clear_memory_pool(self) -> None: + self.zero_const_pool = {} + self.reduce_scatter_memory_pool = {} + def get_all_gather_memory(self, module): block_index = self.module_to_index[module] return self.all_gather_memory_pool[block_index % 2][module._fstp_name] - + def get_bias_memory(self, module: nn.Module): block_index = self.module_to_index[module] # if the bias memory pool is empty or module has not been allocated memory @@ -119,19 +126,20 @@ def get_bias_memory(self, module: nn.Module): for _ in range(2): weight = {} weight[module._fstp_name] = torch.zeros( - self.module_shape[module._fstp_name][0], - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() + self.module_shape[module._fstp_name][0], + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() self.all_gather_bias_memory_pool.append(weight) elif module._fstp_name not in self.all_gather_bias_memory_pool[0]: for i in range(2): self.all_gather_bias_memory_pool[i][module._fstp_name] = torch.zeros( - self.module_shape[module._fstp_name][0], - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() - + self.module_shape[module._fstp_name][0], + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] - def get_reduce_scatter_memory(self, key): return_idx = 0 @@ -157,10 +170,19 @@ def get_reduce_scatter_memory(self, key): def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True - + def _all_gather_block_weight_memory_pool(self, block_index: int): fstp_modules = self.index_to_fstp_modules[block_index] for module in fstp_modules: @@ -182,7 +190,7 @@ def _all_gather_block_weight_memory_pool(self, block_index: int): module=module, ) self.bias_global_handle[module] = bias_handle - + weight_handle = all_gather_raw_memory_pool( module.weight, self.process_group, diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 8a1281e8..42a84003 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -140,6 +140,7 @@ def all_gather_raw_memory_pool( ) return handle + def all_gather_raw_bias_memory_pool( input_: Tensor, process_group: ProcessGroup, diff --git a/train.py b/train.py index 96dc24d1..b4f2a6d2 100644 --- a/train.py +++ b/train.py @@ -298,8 +298,7 @@ def main(args): prof.step() if gpc.fstp_handler is not None: - gpc.fstp_handler.zero_const_pool = {} - gpc.fstp_handler.reduce_scatter_memory_pool = {} + gpc.fstp_handler.clear_memory_pool() # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() From 03cc7f9b80bc94c4b3234da8d32674189c66aa5f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 15:28:34 +0800 Subject: [PATCH 046/153] feat(model/overlap_handler.py): fix lint error --- internlm/model/overlap_handler.py | 14 +++++++------- internlm/model/utils.py | 7 ++----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 3f7ee055..6870fe68 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -204,27 +204,27 @@ def _register_sync_parameters_hook(self) -> None: register forward hooks and backward hooks for fstp modules.
""" - def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): + def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 self._all_gather_block_weight_memory_pool(0) - def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): + def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # pylint: disable=W0613 block_index = self.module_to_index[module] # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: self._all_gather_block_weight_memory_pool(block_index + 1) - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 handle = self.fstp_global_handle[module] handle.wait() if module.bias is not None: bias_handle = self.bias_global_handle[module] bias_handle.wait() - def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): + def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 if module in self.fstp_global_handle: del self.fstp_global_handle[module] - def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): + def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 first_backward_module = self.fstp_modules[-1] weight_handle = all_gather_raw_memory_pool( first_backward_module.weight, @@ -234,7 +234,7 @@ def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): ) self.fstp_global_handle[first_backward_module] = weight_handle - def _pre_backward_hook_for_module(module: nn.Module, grad_output): + def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module weight_handle = self.fstp_global_handle[module] weight_handle.wait() @@ -251,7 +251,7 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): ) self.fstp_global_handle[next_module] = weight_handle - def _post_backward_hook_for_module(module, grad_input, grad_output): + def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: disable=W0613 if module in self.fstp_global_handle: del self.fstp_global_handle[module] diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 42a84003..982c0e08 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -129,7 +129,6 @@ def all_gather_raw_memory_pool( input_: Tensor, process_group: ProcessGroup, async_op: bool = False, - gather_dim: int = 0, module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( @@ -145,7 +144,6 @@ def all_gather_raw_bias_memory_pool( input_: Tensor, process_group: ProcessGroup, async_op: bool = False, - gather_dim: int = 0, module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( @@ -283,8 +281,8 @@ def backward(ctx, grad_output, *args): class MegatronFusedDenseFunc(torch.autograd.Function): """ FusedDenseFunc for tensor parallel in megatron implementation. - The diffenrence between the implementation of flash-attn and megatron is that the total_x could be saved for backward in megatron, - so that the all-gather in backward is ommited. + The diffenrence between the implementation of flash-attn and megatron is that the total_x could be + saved for backward in megatron, so that the all-gather in backward is ommited. 
""" @staticmethod @@ -433,7 +431,6 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.contiguous() process_group = ctx.process_group sequence_parallel = ctx.sequence_parallel - gather_dim = ctx.gather_dim if ctx.compute_weight_gradient: total_x, weight = ctx.saved_tensors else: From 9cf1ff0f6e8a3db1dd1e61fd7b91a056b13041ef Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 15:31:41 +0800 Subject: [PATCH 047/153] feat(solver/optimizer/hybrid_zero_optim.py): minor update --- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 0d0c8a3b..d2c894c9 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -350,7 +350,7 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - gpc.fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) + self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) From b2c1a70477bff8e266dcb3155c2f794dfd7cbf5f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 15:34:24 +0800 Subject: [PATCH 048/153] feat(train/training_internlm.py): fix lint error --- internlm/train/training_internlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index b05611bc..5e874d39 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -50,7 +50,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile, get_current_device +from internlm.utils.common import DummyProfile from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import sync_model_param, sync_model_param_within_tp From 0996c47e49bf967aeff2aa83326a77fdcbdd9b64 Mon Sep 17 00:00:00 2001 From: "chenxun.p" Date: Mon, 23 Oct 2023 16:17:57 +0800 Subject: [PATCH 049/153] fix accumulate grads bug --- internlm/solver/optimizer/hybrid_zero_optim.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index d2c894c9..247f8212 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -590,14 +590,14 @@ def step(self, closure=None): if param.grad is not None: self._store_and_try_reduce_grads_by_bucket(param) - # we need to reduce the gradients left in the communication bucket - for group_id in range(self.num_param_groups): - self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) - # we need to accumulate gradients left in the accumulate gardient bucket for group_id in range(self.num_param_groups): self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id], reduce_rank=None) + # we need to reduce the gradients left in the communication bucket + for group_id in 
range(self.num_param_groups): + self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) + # compute norm for gradients in the before bucket groups_norms = [] for group_id in range(self.num_param_groups):
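The reordering in this fix matters because, with gradient accumulation, leftover accumulated gradients have to be folded into param.grad before the final bucket reduction ships anything. A toy, non-distributed illustration of the invariant (hypothetical names, not the optimizer's real data structures):

param_grads = {"w": 2.0}    # gradient from the current backward pass
accum_bucket = {"w": 3.0}   # gradient still parked for accumulation

for name, grad in accum_bucket.items():   # step 1: drain accumulation buckets
    param_grads[name] += grad
accum_bucket.clear()

reduced = dict(param_grads)   # step 2: only now run the last reduction
assert reduced["w"] == 5.0    # reducing first would have shipped only 2.0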
- """ - - def __init__(self, metric: Optional[Callable] = None, skip: bool = False) -> None: - self._post_func = metric - self._skip = skip - - def before_forward(self, scheduler, inputs) -> None: - if not self._skip: - timer("fwd").start() - - def after_forward(self, scheduler, outputs) -> None: - if not self._skip: - timer("fwd").stop() - - def before_criterion(self, scheduler, outputs, label) -> None: - if not self._skip: - timer("cal_loss").start() - - def after_criterion(self, scheduler, loss) -> None: - if not self._skip: - timer("cal_loss").stop() - - def before_backward(self, scheduler, outputs, outputs_grad) -> None: - if not self._skip: - timer("bwd").start() - - def after_backward(self, scheduler, inputs_grad) -> None: - if not self._skip: - timer("bwd").stop() - - def post_helper_func(self, scheduler, outputs, label) -> None: - if self._post_func is not None: - self._post_func(outputs, label) diff --git a/internlm/model/metrics.py b/internlm/model/metrics.py index 3a77f8b8..c32d8294 100644 --- a/internlm/model/metrics.py +++ b/internlm/model/metrics.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Callable, List, Optional import torch from flash_attn.losses.cross_entropy import CrossEntropyLoss as FlashCrossEntropyLoss @@ -6,6 +6,8 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc +from internlm.core.scheduler import SchedulerHook +from internlm.utils.megatron_timers import megatron_timer as timer class AccPerplex: @@ -260,3 +262,41 @@ def get_metric(self, reset=True): self.ds_token_num.fill_(0.0) return res + + +class SchedulerMetricHook(SchedulerHook): + """ + Scheduler Metric Hook. + """ + + def __init__(self, metric: Optional[Callable] = None, skip: bool = False) -> None: + self._post_func = metric + self._skip = skip + + def before_forward(self, scheduler, inputs) -> None: + if not self._skip: + timer("fwd").start() + + def after_forward(self, scheduler, outputs) -> None: + if not self._skip: + timer("fwd").stop() + + def before_criterion(self, scheduler, outputs, label) -> None: + if not self._skip: + timer("cal_loss").start() + + def after_criterion(self, scheduler, loss) -> None: + if not self._skip: + timer("cal_loss").stop() + + def before_backward(self, scheduler, outputs, outputs_grad) -> None: + if not self._skip: + timer("bwd").start() + + def after_backward(self, scheduler, inputs_grad) -> None: + if not self._skip: + timer("bwd").stop() + + def post_helper_func(self, scheduler, outputs, label) -> None: + if self._post_func is not None: + self._post_func(outputs, label) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 6870fe68..098fc8c8 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -8,6 +8,7 @@ from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel +from internlm.core.scheduler import SchedulerHook from internlm.model.embedding import Embedding1D from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear from internlm.model.utils import ( @@ -33,6 +34,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules self.head = [] self.embedding = [] + self.model_checkpoint = gpc.config.model.checkpoint + self.is_forward = True self.reduce_scatter_handlers = {} self.zero_const_pool = {} @@ -81,6 +84,9 @@ def get_zero_by_shape(self, size: 
tuple, dtype, device) -> torch.Tensor: return self.zero_const_pool[size] + def set_forward_mode(self, flag): + self.is_forward = flag + def _initialize_module_shape(self): hidden_size = gpc.config.HIDDEN_SIZE mlp_ratio = gpc.config.MLP_RATIO @@ -121,7 +127,6 @@ def get_all_gather_memory(self, module): def get_bias_memory(self, module: nn.Module): block_index = self.module_to_index[module] # if the bias memory pool is empty or module has not been allocated memory - # import pdb; pdb.set_trace() if len(self.all_gather_bias_memory_pool) == 0: for _ in range(2): weight = {} @@ -209,9 +214,13 @@ def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # pylint: disable=W0613 block_index = self.module_to_index[module] - # start the all-gather for next block - if block_index + 1 < gpc.config.NUM_LAYER: - self._all_gather_block_weight_memory_pool(block_index + 1) + if self.model_checkpoint and self.is_forward is False: + if block_index - 1 >= 0: + self._all_gather_block_weight_memory_pool(block_index - 1) + else: + # start the all-gather for next block + if block_index + 1 < gpc.config.NUM_LAYER: + self._all_gather_block_weight_memory_pool(block_index + 1) def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 handle = self.fstp_global_handle[module] @@ -234,6 +243,9 @@ def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # ) self.fstp_global_handle[first_backward_module] = weight_handle + def _pre_backward_hook_for_head(module: nn.Module, grad_output): + self._all_gather_block_weight_memory_pool(gpc.config.NUM_LAYER - 1) + def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module weight_handle = self.fstp_global_handle[module] @@ -264,6 +276,10 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: for embedding in self.embedding: embedding.register_forward_hook(_post_forward_hook_for_embedding) + if self.model_checkpoint and self.is_forward is False: + for head in self.head: + head.register_full_backward_pre_hook(_pre_backward_hook_for_head) + for out_proj in self.fstp_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) @@ -275,9 +291,42 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: # 1. register post_backward_hook @head module to prefetch for the last block's last module # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module # 3.
register post_backward_hook @fstp_module to release resource - for head in self.head: - head.register_full_backward_hook(_post_backward_hook_for_head) + if gpc.config.model.checkpoint is False: + for head in self.head: + head.register_full_backward_hook(_post_backward_hook_for_head) - for module in self.fstp_modules: - module.register_full_backward_pre_hook(_pre_backward_hook_for_module) - module.register_full_backward_hook(_post_backward_hook_for_module) + for module in self.fstp_modules: + module.register_full_backward_pre_hook(_pre_backward_hook_for_module) + module.register_full_backward_hook(_post_backward_hook_for_module) + + +class FSTPOverlapSchedulerHook(SchedulerHook): + """ + SchedulerHook for fstp overlap handler + """ + + def __init__(self, overlap_handler: FSTPOverlapHandler) -> None: + super().__init__() + + self._overlap_handler = overlap_handler + + def before_forward(self, scheduler, inputs) -> None: + self._overlap_handler.set_forward_mode(True) + + def after_forward(self, scheduler, outputs) -> None: + pass + + def before_criterion(self, scheduler, outputs, label) -> None: + pass + + def after_criterion(self, scheduler, loss) -> None: + pass + + def before_backward(self, scheduler, outputs, outputs_grad) -> None: + self._overlap_handler.set_forward_mode(False) + + def after_backward(self, scheduler, inputs_grad) -> None: + pass + + def post_helper_func(self, scheduler, outputs, label) -> None: + pass diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index f708fa78..c6e27a68 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -6,8 +6,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.scheduler import SchedulerMetricHook -from internlm.model.metrics import AccPerplex +from internlm.model.metrics import AccPerplex, SchedulerMetricHook @contextmanager
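Before the train.py wiring below, it is worth spelling out why the hook pair above exists: with activation checkpointing, the forward hooks fire again during backward-time recomputation, so the handler has to know which direction to prefetch in. A minimal sketch of the mode-flag pattern (toy class, not the project's API):

class ToyHandler:
    def __init__(self):
        self.is_forward = True

    def set_forward_mode(self, flag):
        self.is_forward = flag   # toggled by the scheduler hook

    def prefetch_target(self, block_index, num_layers):
        # true forward pass: prefetch the next block; recomputation during
        # backward: prefetch the previous block instead
        if self.is_forward:
            return block_index + 1 if block_index + 1 < num_layers else None
        return block_index - 1 if block_index - 1 >= 0 else None

handler = ToyHandler()
handler.set_forward_mode(False)            # as in before_backward
assert handler.prefetch_target(3, 8) == 2  # recomputation walks backwards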
diff --git a/train.py b/train.py index b4f2a6d2..ae867287 100644 --- a/train.py +++ b/train.py @@ -5,6 +5,7 @@ import time import traceback from functools import partial +from typing import List, Optional import torch import torch.distributed as dist @@ -12,11 +13,12 @@ import internlm from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.scheduler import SchedulerMetricHook +from internlm.core.scheduler import SchedulerHook from internlm.core.trainer import TrainState from internlm.initialize import initialize_distributed_env from internlm.model.loss import FlashGPTLMLoss -from internlm.model.metrics import AccPerplex +from internlm.model.metrics import AccPerplex, SchedulerMetricHook +from internlm.model.overlap_handler import FSTPOverlapSchedulerHook from internlm.monitor import initialize_monitor_manager, send_alert_message from internlm.monitor.monitor import monitor_manager as mm from internlm.train import ( @@ -67,6 +69,30 @@ def initialize_llm_logger(start_time: str): return uniscale_logger +def get_scheduler_hooks( + metric: Optional[AccPerplex] = None, activation_checkpoint: bool = False +) -> List[SchedulerHook]: + scheduler_hooks: List[SchedulerHook] = [] + + if metric is not None: + scheduler_hooks.append( + SchedulerMetricHook( + metric=metric, + skip=( + gpc.is_using_pp() + and hasattr(gpc.config.model, "num_chunks") + and gpc.config.model.num_chunks > 1 + and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) + ), + ), + ) + + if activation_checkpoint: + scheduler_hooks.append(FSTPOverlapSchedulerHook(gpc.fstp_handler)) + + return scheduler_hooks + + def main(args): # init setting skip_batches = gpc.config.data.skip_batches @@ -149,17 +175,6 @@ def main(args): ) # initialize trainer - scheduler_hooks = [ - SchedulerMetricHook( - metric=metric, - skip=( - gpc.is_using_pp() - and hasattr(gpc.config.model, "num_chunks") - and gpc.config.model.num_chunks > 1 - and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) - ), - ), - ] trainer, train_dl, _, _ = internlm.initialize_trainer( model=model, @@ -168,7 +183,7 @@ def main(args): train_dataloader=train_dl, lr_scheduler=lr_scheduler, beta2_scheduler=beta2_scheduler, - scheduler_hooks=scheduler_hooks, + scheduler_hooks=get_scheduler_hooks(metric, gpc.config.model.checkpoint), ) # initialize simple memory profiler From 5d8313693b01769a4239d7938667c3d01a5a3d90 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 24 Oct 2023 17:29:09 +0800 Subject: [PATCH 051/153] feat(model/overlap_handler.py): fix head post backward hook when activation --- internlm/model/overlap_handler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 098fc8c8..5cef92f9 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -244,7 +244,8 @@ def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # self.fstp_global_handle[first_backward_module] = weight_handle def _pre_backward_hook_for_head(module: nn.Module, grad_output): - self._all_gather_block_weight_memory_pool(gpc.config.NUM_LAYER - 1) + if self.is_forward is False: + self._all_gather_block_weight_memory_pool(gpc.config.NUM_LAYER - 1) def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module @@ -276,7 +277,7 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: for embedding in self.embedding: embedding.register_forward_hook(_post_forward_hook_for_embedding) - if self.model_checkpoint and self.is_forward is False: + if self.model_checkpoint: for head in self.head: head.register_full_backward_pre_hook(_pre_backward_hook_for_head) @@ -291,7 +292,7 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: # 1. register post_backward_hook @head module to prefetch for the last block's last module # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module # 3.
register post_backward_hook @fstp_module to release resource - if gpc.config.model.checkpoint is False: + if self.model_checkpoint is False: for head in self.head: head.register_full_backward_hook(_post_backward_hook_for_head) From 262de4b796104253139c8082f6f51402618a425e Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 24 Oct 2023 17:54:26 +0800 Subject: [PATCH 052/153] support tflops computation and generate test py files --- .gitignore | 6 + configs/13B_template.py | 180 +++++++++++++++++ .../13B_train/131072_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/131072_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/131072_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/131072_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/131072_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/131072_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/131072_megatron_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/131072_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/131072_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/131072_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/16384_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/16384_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/16384_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/16384_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/16384_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/16384_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/16384_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/16384_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/16384_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/16384_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/262144_flash-attn_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/262144_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/262144_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/262144_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/262144_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/262144_megatron_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/262144_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/262144_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/262144_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/32768_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/32768_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/32768_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/32768_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/32768_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/32768_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/32768_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/32768_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/32768_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/32768_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/4096_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/4096_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/4096_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/4096_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/4096_intern_ckpt_False.py | 180 +++++++++++++++++ 
configs/13B_train/4096_intern_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/4096_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/4096_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/4096_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/4096_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/65536_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/65536_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/65536_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/65536_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/65536_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/65536_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/65536_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/65536_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/65536_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/65536_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/8192_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/8192_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/8192_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/8192_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/8192_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/8192_intern_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/8192_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/8192_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/8192_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/8192_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_template.py | 180 +++++++++++++++++ configs/30B_train/131072_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/131072_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/131072_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/131072_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/131072_megatron_ckpt_False.py | 180 +++++++++++++++++ .../30B_train/131072_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/131072_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/131072_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/16384_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/16384_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/16384_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/16384_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/16384_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/16384_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/16384_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/16384_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/262144_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/262144_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/262144_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/262144_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/262144_megatron_ckpt_False.py | 180 +++++++++++++++++ .../30B_train/262144_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/262144_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/262144_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/32768_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/32768_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/32768_intern_ckpt_False.py | 
180 +++++++++++++++++ configs/30B_train/32768_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/32768_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/32768_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/32768_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/32768_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/4096_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/4096_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/4096_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/4096_intern_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/4096_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/4096_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/4096_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/4096_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/65536_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/65536_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/65536_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/65536_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/65536_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/65536_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/65536_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/65536_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/8192_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/8192_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/8192_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/8192_intern_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/8192_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/8192_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/8192_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/8192_none_ckpt_True.py | 180 +++++++++++++++++ configs/7B_sft.py | 12 +- configs/7B_template.py | 181 ++++++++++++++++++ .../7B_train/131072_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/131072_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/131072_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/131072_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/131072_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/131072_intern_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/131072_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/131072_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/131072_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/131072_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/16384_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/16384_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/16384_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/16384_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/16384_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/16384_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/16384_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/16384_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/16384_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/16384_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/262144_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/262144_flash-attn_ckpt_True.py | 181 
++++++++++++++++++ configs/7B_train/262144_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/262144_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/262144_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/262144_intern_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/262144_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/262144_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/262144_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/262144_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/32768_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/32768_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/32768_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/32768_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/32768_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/32768_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/32768_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/32768_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/32768_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/32768_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/4096_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/4096_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/4096_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/4096_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/4096_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/65536_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/65536_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/65536_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/65536_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/65536_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/65536_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/65536_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/65536_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/65536_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/65536_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/8192_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/8192_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/8192_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/8192_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/8192_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_none_ckpt_True.py | 181 ++++++++++++++++++ configs/generate.py | 44 +++++ internlm/train/training_internlm.py | 8 + 202 files changed, 35775 insertions(+), 6 deletions(-) create mode 100644 configs/13B_template.py create mode 100644 configs/13B_train/131072_flash-attn_ckpt_False.py create mode 100644 
configs/13B_train/131072_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/131072_flash_ckpt_False.py create mode 100644 configs/13B_train/131072_flash_ckpt_True.py create mode 100644 configs/13B_train/131072_intern_ckpt_False.py create mode 100644 configs/13B_train/131072_intern_ckpt_True.py create mode 100644 configs/13B_train/131072_megatron_ckpt_False.py create mode 100644 configs/13B_train/131072_megatron_ckpt_True.py create mode 100644 configs/13B_train/131072_none_ckpt_False.py create mode 100644 configs/13B_train/131072_none_ckpt_True.py create mode 100644 configs/13B_train/16384_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/16384_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/16384_flash_ckpt_False.py create mode 100644 configs/13B_train/16384_flash_ckpt_True.py create mode 100644 configs/13B_train/16384_intern_ckpt_False.py create mode 100644 configs/13B_train/16384_intern_ckpt_True.py create mode 100644 configs/13B_train/16384_megatron_ckpt_False.py create mode 100644 configs/13B_train/16384_megatron_ckpt_True.py create mode 100644 configs/13B_train/16384_none_ckpt_False.py create mode 100644 configs/13B_train/16384_none_ckpt_True.py create mode 100644 configs/13B_train/262144_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/262144_flash_ckpt_False.py create mode 100644 configs/13B_train/262144_flash_ckpt_True.py create mode 100644 configs/13B_train/262144_intern_ckpt_False.py create mode 100644 configs/13B_train/262144_intern_ckpt_True.py create mode 100644 configs/13B_train/262144_megatron_ckpt_False.py create mode 100644 configs/13B_train/262144_megatron_ckpt_True.py create mode 100644 configs/13B_train/262144_none_ckpt_False.py create mode 100644 configs/13B_train/262144_none_ckpt_True.py create mode 100644 configs/13B_train/32768_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/32768_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/32768_flash_ckpt_False.py create mode 100644 configs/13B_train/32768_flash_ckpt_True.py create mode 100644 configs/13B_train/32768_intern_ckpt_False.py create mode 100644 configs/13B_train/32768_intern_ckpt_True.py create mode 100644 configs/13B_train/32768_megatron_ckpt_False.py create mode 100644 configs/13B_train/32768_megatron_ckpt_True.py create mode 100644 configs/13B_train/32768_none_ckpt_False.py create mode 100644 configs/13B_train/32768_none_ckpt_True.py create mode 100644 configs/13B_train/4096_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/4096_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/4096_flash_ckpt_False.py create mode 100644 configs/13B_train/4096_flash_ckpt_True.py create mode 100644 configs/13B_train/4096_intern_ckpt_False.py create mode 100644 configs/13B_train/4096_intern_ckpt_True.py create mode 100644 configs/13B_train/4096_megatron_ckpt_False.py create mode 100644 configs/13B_train/4096_megatron_ckpt_True.py create mode 100644 configs/13B_train/4096_none_ckpt_False.py create mode 100644 configs/13B_train/4096_none_ckpt_True.py create mode 100644 configs/13B_train/65536_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/65536_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/65536_flash_ckpt_False.py create mode 100644 configs/13B_train/65536_flash_ckpt_True.py create mode 100644 configs/13B_train/65536_intern_ckpt_False.py create mode 100644 configs/13B_train/65536_intern_ckpt_True.py create mode 100644 configs/13B_train/65536_megatron_ckpt_False.py create mode 100644 
configs/13B_train/65536_megatron_ckpt_True.py create mode 100644 configs/13B_train/65536_none_ckpt_False.py create mode 100644 configs/13B_train/65536_none_ckpt_True.py create mode 100644 configs/13B_train/8192_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/8192_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/8192_flash_ckpt_False.py create mode 100644 configs/13B_train/8192_flash_ckpt_True.py create mode 100644 configs/13B_train/8192_intern_ckpt_False.py create mode 100644 configs/13B_train/8192_intern_ckpt_True.py create mode 100644 configs/13B_train/8192_megatron_ckpt_False.py create mode 100644 configs/13B_train/8192_megatron_ckpt_True.py create mode 100644 configs/13B_train/8192_none_ckpt_False.py create mode 100644 configs/13B_train/8192_none_ckpt_True.py create mode 100644 configs/30B_template.py create mode 100644 configs/30B_train/131072_flash_ckpt_False.py create mode 100644 configs/30B_train/131072_flash_ckpt_True.py create mode 100644 configs/30B_train/131072_intern_ckpt_False.py create mode 100644 configs/30B_train/131072_intern_ckpt_True.py create mode 100644 configs/30B_train/131072_megatron_ckpt_False.py create mode 100644 configs/30B_train/131072_megatron_ckpt_True.py create mode 100644 configs/30B_train/131072_none_ckpt_False.py create mode 100644 configs/30B_train/131072_none_ckpt_True.py create mode 100644 configs/30B_train/16384_flash_ckpt_False.py create mode 100644 configs/30B_train/16384_flash_ckpt_True.py create mode 100644 configs/30B_train/16384_intern_ckpt_False.py create mode 100644 configs/30B_train/16384_intern_ckpt_True.py create mode 100644 configs/30B_train/16384_megatron_ckpt_False.py create mode 100644 configs/30B_train/16384_megatron_ckpt_True.py create mode 100644 configs/30B_train/16384_none_ckpt_False.py create mode 100644 configs/30B_train/16384_none_ckpt_True.py create mode 100644 configs/30B_train/262144_flash_ckpt_False.py create mode 100644 configs/30B_train/262144_flash_ckpt_True.py create mode 100644 configs/30B_train/262144_intern_ckpt_False.py create mode 100644 configs/30B_train/262144_intern_ckpt_True.py create mode 100644 configs/30B_train/262144_megatron_ckpt_False.py create mode 100644 configs/30B_train/262144_megatron_ckpt_True.py create mode 100644 configs/30B_train/262144_none_ckpt_False.py create mode 100644 configs/30B_train/262144_none_ckpt_True.py create mode 100644 configs/30B_train/32768_flash_ckpt_False.py create mode 100644 configs/30B_train/32768_flash_ckpt_True.py create mode 100644 configs/30B_train/32768_intern_ckpt_False.py create mode 100644 configs/30B_train/32768_intern_ckpt_True.py create mode 100644 configs/30B_train/32768_megatron_ckpt_False.py create mode 100644 configs/30B_train/32768_megatron_ckpt_True.py create mode 100644 configs/30B_train/32768_none_ckpt_False.py create mode 100644 configs/30B_train/32768_none_ckpt_True.py create mode 100644 configs/30B_train/4096_flash_ckpt_False.py create mode 100644 configs/30B_train/4096_flash_ckpt_True.py create mode 100644 configs/30B_train/4096_intern_ckpt_False.py create mode 100644 configs/30B_train/4096_intern_ckpt_True.py create mode 100644 configs/30B_train/4096_megatron_ckpt_False.py create mode 100644 configs/30B_train/4096_megatron_ckpt_True.py create mode 100644 configs/30B_train/4096_none_ckpt_False.py create mode 100644 configs/30B_train/4096_none_ckpt_True.py create mode 100644 configs/30B_train/65536_flash_ckpt_False.py create mode 100644 configs/30B_train/65536_flash_ckpt_True.py create mode 100644 
configs/30B_train/65536_intern_ckpt_False.py create mode 100644 configs/30B_train/65536_intern_ckpt_True.py create mode 100644 configs/30B_train/65536_megatron_ckpt_False.py create mode 100644 configs/30B_train/65536_megatron_ckpt_True.py create mode 100644 configs/30B_train/65536_none_ckpt_False.py create mode 100644 configs/30B_train/65536_none_ckpt_True.py create mode 100644 configs/30B_train/8192_flash_ckpt_False.py create mode 100644 configs/30B_train/8192_flash_ckpt_True.py create mode 100644 configs/30B_train/8192_intern_ckpt_False.py create mode 100644 configs/30B_train/8192_intern_ckpt_True.py create mode 100644 configs/30B_train/8192_megatron_ckpt_False.py create mode 100644 configs/30B_train/8192_megatron_ckpt_True.py create mode 100644 configs/30B_train/8192_none_ckpt_False.py create mode 100644 configs/30B_train/8192_none_ckpt_True.py create mode 100644 configs/7B_template.py create mode 100644 configs/7B_train/131072_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/131072_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/131072_flash_ckpt_False.py create mode 100644 configs/7B_train/131072_flash_ckpt_True.py create mode 100644 configs/7B_train/131072_intern_ckpt_False.py create mode 100644 configs/7B_train/131072_intern_ckpt_True.py create mode 100644 configs/7B_train/131072_megatron_ckpt_False.py create mode 100644 configs/7B_train/131072_megatron_ckpt_True.py create mode 100644 configs/7B_train/131072_none_ckpt_False.py create mode 100644 configs/7B_train/131072_none_ckpt_True.py create mode 100644 configs/7B_train/16384_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/16384_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/16384_flash_ckpt_False.py create mode 100644 configs/7B_train/16384_flash_ckpt_True.py create mode 100644 configs/7B_train/16384_intern_ckpt_False.py create mode 100644 configs/7B_train/16384_intern_ckpt_True.py create mode 100644 configs/7B_train/16384_megatron_ckpt_False.py create mode 100644 configs/7B_train/16384_megatron_ckpt_True.py create mode 100644 configs/7B_train/16384_none_ckpt_False.py create mode 100644 configs/7B_train/16384_none_ckpt_True.py create mode 100644 configs/7B_train/262144_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/262144_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/262144_flash_ckpt_False.py create mode 100644 configs/7B_train/262144_flash_ckpt_True.py create mode 100644 configs/7B_train/262144_intern_ckpt_False.py create mode 100644 configs/7B_train/262144_intern_ckpt_True.py create mode 100644 configs/7B_train/262144_megatron_ckpt_False.py create mode 100644 configs/7B_train/262144_megatron_ckpt_True.py create mode 100644 configs/7B_train/262144_none_ckpt_False.py create mode 100644 configs/7B_train/262144_none_ckpt_True.py create mode 100644 configs/7B_train/32768_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/32768_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/32768_flash_ckpt_False.py create mode 100644 configs/7B_train/32768_flash_ckpt_True.py create mode 100644 configs/7B_train/32768_intern_ckpt_False.py create mode 100644 configs/7B_train/32768_intern_ckpt_True.py create mode 100644 configs/7B_train/32768_megatron_ckpt_False.py create mode 100644 configs/7B_train/32768_megatron_ckpt_True.py create mode 100644 configs/7B_train/32768_none_ckpt_False.py create mode 100644 configs/7B_train/32768_none_ckpt_True.py create mode 100644 configs/7B_train/4096_flash-attn_ckpt_False.py create mode 100644 
configs/7B_train/4096_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/4096_flash_ckpt_False.py create mode 100644 configs/7B_train/4096_flash_ckpt_True.py create mode 100644 configs/7B_train/4096_intern_ckpt_False.py create mode 100644 configs/7B_train/4096_intern_ckpt_True.py create mode 100644 configs/7B_train/4096_megatron_ckpt_False.py create mode 100644 configs/7B_train/4096_megatron_ckpt_True.py create mode 100644 configs/7B_train/4096_none_ckpt_False.py create mode 100644 configs/7B_train/4096_none_ckpt_True.py create mode 100644 configs/7B_train/65536_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/65536_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/65536_flash_ckpt_False.py create mode 100644 configs/7B_train/65536_flash_ckpt_True.py create mode 100644 configs/7B_train/65536_intern_ckpt_False.py create mode 100644 configs/7B_train/65536_intern_ckpt_True.py create mode 100644 configs/7B_train/65536_megatron_ckpt_False.py create mode 100644 configs/7B_train/65536_megatron_ckpt_True.py create mode 100644 configs/7B_train/65536_none_ckpt_False.py create mode 100644 configs/7B_train/65536_none_ckpt_True.py create mode 100644 configs/7B_train/8192_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/8192_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/8192_flash_ckpt_False.py create mode 100644 configs/7B_train/8192_flash_ckpt_True.py create mode 100644 configs/7B_train/8192_intern_ckpt_False.py create mode 100644 configs/7B_train/8192_intern_ckpt_True.py create mode 100644 configs/7B_train/8192_megatron_ckpt_False.py create mode 100644 configs/7B_train/8192_megatron_ckpt_True.py create mode 100644 configs/7B_train/8192_none_ckpt_False.py create mode 100644 configs/7B_train/8192_none_ckpt_True.py create mode 100644 configs/generate.py diff --git a/.gitignore b/.gitignore index 8992a0f5..04367e3d 100644 --- a/.gitignore +++ b/.gitignore @@ -145,3 +145,9 @@ core.* llm_ckpts events.* memory_trace +7b_train*/ +13b_train*/ +30b_train*/ +fstp_logs/ +atb +pip diff --git a/configs/13B_template.py b/configs/13B_template.py new file mode 100644 index 00000000..26be3f71 --- /dev/null +++ b/configs/13B_template.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = {seq_len} +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
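In this template, `{seq_len}`, `{sp}`, `{checkpoint}` and `{intern_overlap}` are placeholders that the new configs/generate.py (44 added lines in the diffstat, not shown in this excerpt) substitutes once per combination of sequence length, sequence-parallel mode and checkpoint flag, and analogously for the 7B and 30B templates. A minimal sketch of such a generator follows; the value lists are inferred from the generated file names, the real script may differ, and `str.replace` is used instead of `str.format` so the literal `{BOTO3_IP}` in the template comments needs no escaping.

# Sketch of a generator for the config files added by this patch; value lists
# are inferred from the generated file names (the actual generate.py may differ).
import os

SEQ_LENS = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
SP_MODES = ["none", "megatron", "flash", "flash-attn", "intern"]  # 13B; 30B omits "flash-attn"

with open("configs/13B_template.py") as f:
    template = f.read()

os.makedirs("configs/13B_train", exist_ok=True)
for seq_len in SEQ_LENS:
    for sp in SP_MODES:
        for ckpt in (True, False):
            content = (
                template.replace("{seq_len}", str(seq_len))
                .replace("{sp}", f'"{sp}"')
                .replace("{checkpoint}", str(ckpt))
                # overlap is only enabled for the "intern" sequence-parallel mode
                .replace("{intern_overlap}", str(sp == "intern"))
            )
            with open(f"configs/13B_train/{seq_len}_{sp}_ckpt_{ckpt}.py", "w") as f:
                f.write(content)

Each generated file is then used like any other config, e.g. `python train.py --config configs/13B_train/131072_intern_ckpt_True.py` under the launcher of your choice.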
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only works for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, which disables evaluation + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimizer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint={checkpoint}, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm",
+ layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_flash-attn_ckpt_False.py b/configs/13B_train/131072_flash-attn_ckpt_False.py new file mode 100644 index 00000000..28d51af6 --- /dev/null +++ b/configs/13B_train/131072_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. 
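The TFLOPS computation named in the commit title lands in internlm/train/training_internlm.py (eight added lines, outside this excerpt). The sketch below shows the usual Megatron-style estimate that such code typically implements; it is a reference approximation, not the patch's exact formula. Note how activation checkpointing (the `checkpoint` field of the model dict) bumps the multiplier from 3 to 4, since the forward pass is recomputed during backward.

def tflops_per_gpu(batch_size, seq_len, num_layers, hidden_size, vocab_size,
                   step_time_s, world_size, activation_checkpoint=False):
    """Rough Megatron-style throughput estimate; not the patch's exact formula."""
    # forward FLOPs of one iteration: 24*B*s*l*h^2 for the transformer stack,
    # where the s/(6h) term covers the attention score/context matmuls, plus
    # the 2*B*s*h*V logits projection onto the vocabulary.
    fwd = (24 * batch_size * seq_len * num_layers * hidden_size**2
           * (1 + seq_len / (6 * hidden_size))
           + 2 * batch_size * seq_len * hidden_size * vocab_size)
    # backward is ~2x forward; activation checkpointing re-runs forward once more.
    factor = 4 if activation_checkpoint else 3
    return factor * fwd / step_time_s / world_size / 1e12

For the 13B shape used in these configs (40 layers, hidden size 5120, vocab 103168), this gives a quick sanity check on the throughput the new code reports.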
+ # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. 
+ * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_flash-attn_ckpt_True.py b/configs/13B_train/131072_flash-attn_ckpt_True.py new file mode 100644 index 00000000..6d1b7ef0 --- /dev/null +++ b/configs/13B_train/131072_flash-attn_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. 
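For orientation, the data dict that follows fixes the token budget per optimizer step. With the values used throughout these generated configs (micro_num=4, micro_bsz=2) and this file's SEQ_LEN=131072, the arithmetic per data-parallel rank works out as:

# token budget per optimizer step, per data-parallel rank
SEQ_LEN, micro_bsz, micro_num = 131072, 2, 4
packed_length = micro_bsz * SEQ_LEN          # 262,144 tokens per micro batch
tokens_per_step = micro_num * packed_length  # 1,048,576 tokens per rank per step
assert tokens_per_step == 2**20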
+ # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. 
fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_flash_ckpt_False.py b/configs/13B_train/131072_flash_ckpt_False.py new file mode 100644 index 00000000..dd0f0e89 --- /dev/null +++ b/configs/13B_train/131072_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
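The grad_scaler block repeated in each of these configs encodes standard dynamic loss scaling. Below is a sketch of the update rule implied by its comments, with the config's defaults as parameters; it is an illustration of the scheme, not InternLM's actual scaler class.

def update_loss_scale(scale, overflow, clean_steps, overflow_count,
                      growth_interval=1000, growth_factor=2, backoff_factor=0.5,
                      hysteresis=2, min_scale=1, max_scale=2**24):
    """Sketch of the dynamic loss-scaling rule described by the config comments."""
    if overflow:
        overflow_count += 1
        clean_steps = 0
        if overflow_count >= hysteresis:    # tolerate `hysteresis` overflows
            scale = max(scale * backoff_factor, min_scale)
            overflow_count = 0
    else:
        clean_steps += 1
        if clean_steps >= growth_interval:  # 1000 clean steps -> double the scale
            scale = min(scale * growth_factor, max_scale)
            clean_steps = 0
    return scale, clean_steps, overflow_count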
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_flash_ckpt_True.py b/configs/13B_train/131072_flash_ckpt_True.py new file mode 100644 index 00000000..2b9276db --- /dev/null +++ b/configs/13B_train/131072_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
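Across the generated files, only the parallel dict at the bottom really varies: `sp` selects the sequence-parallel flavour ("none", "megatron", "flash", "flash-attn" or "intern") and `intern_overlap` is only ever enabled together with sp="intern". A sanity check one could run over a loaded config might look like the following; the helper name is hypothetical.

VALID_SP = {"none", "megatron", "flash", "flash-attn", "intern"}

def check_parallel(parallel: dict) -> None:
    # hypothetical validator, mirroring the docstring above each parallel dict
    tensor = parallel["tensor"]
    assert tensor["sp"] in VALID_SP, f"unknown sp mode: {tensor['sp']}"
    if tensor.get("intern_overlap", False):
        # in the generated configs, overlap is only enabled for the intern mode
        assert tensor["sp"] == "intern"
    assert parallel["pipeline"]["size"] >= 1

check_parallel(dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=8, sp="intern", intern_overlap=True),
    pipeline=dict(size=1, interleaved_overlap=True),
))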
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_intern_ckpt_False.py b/configs/13B_train/131072_intern_ckpt_False.py new file mode 100644 index 00000000..182e4ddb --- /dev/null +++ b/configs/13B_train/131072_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
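zero1=dict(size=-1, fsdp=False) relies on the rule spelled out in the zero1 docstring repeated in each of these configs: a non-positive size makes the ZeRO-1 group span the whole data-parallel group. A sketch of that resolution logic (the helper name is hypothetical):

def zero1_world_size(size: int, dp_world_size: int) -> int:
    # size <= 0: shard optimizer states across the full dp group (the -1 used here)
    if size <= 0:
        return dp_world_size
    assert 1 <= size <= dp_world_size and dp_world_size % size == 0
    return size  # size == 1 keeps full optimizer states on every dp rank

# e.g. 32 GPUs with tensor=8 leave dp_world_size = 4, so zero1=-1 shards over 4 ranks
assert zero1_world_size(-1, 4) == 4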
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_intern_ckpt_True.py b/configs/13B_train/131072_intern_ckpt_True.py
new file mode 100644
index 00000000..c23a3c10
--- /dev/null
+++ b/configs/13B_train/131072_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
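+    # [Editor's note] A hedged reading of the flags above: with enable_save_ckpt=False this
+    # run writes no checkpoints itself, so auto_resume=True can only pick up checkpoints
+    # left in SAVE_CKPT_FOLDER by an earlier run that had saving enabled.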
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_megatron_ckpt_False.py b/configs/13B_train/131072_megatron_ckpt_False.py
new file mode 100644
index 00000000..935ff98d
--- /dev/null
+++ b/configs/13B_train/131072_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
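+    # [Editor's note] Worked example of the packed_length formula documented in the data
+    # dict below: packed_length = micro_bsz * SEQ_LEN = 2 * 131072 = 262144 tokens per
+    # packed micro batch (editor's arithmetic, not an original comment).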
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_megatron_ckpt_True.py b/configs/13B_train/131072_megatron_ckpt_True.py
new file mode 100644
index 00000000..441166c2
--- /dev/null
+++ b/configs/13B_train/131072_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
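+    # [Editor's note] Editor's arithmetic, assuming the data settings defined below: each
+    # rank consumes micro_num * micro_bsz * SEQ_LEN = 4 * 2 * 131072 = 1048576 tokens per
+    # optimizer step, before multiplying by the data parallel world size.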
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_none_ckpt_False.py b/configs/13B_train/131072_none_ckpt_False.py
new file mode 100644
index 00000000..e43d6044
--- /dev/null
+++ b/configs/13B_train/131072_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
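+    # [Editor's note] A worked trace of the grad_scaler settings below: the loss scale
+    # starts at 2**16 = 65536, halves (backoff_factor=0.5) after hysteresis=2 overflows,
+    # doubles (growth_factor=2) after 1000 overflow-free steps, and is capped at 2**24.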
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_none_ckpt_True.py b/configs/13B_train/131072_none_ckpt_True.py
new file mode 100644
index 00000000..0945dbdc
--- /dev/null
+++ b/configs/13B_train/131072_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
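+    # [Editor's note] Rough size check (editor's arithmetic, assuming a standard gated-MLP
+    # transformer): 12 * NUM_LAYER * HIDDEN_SIZE**2 = 12 * 40 * 5120**2 ≈ 12.6B parameters,
+    # plus ~0.5B for the embeddings, consistent with the "13b" in JOB_NAME.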
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_flash-attn_ckpt_False.py b/configs/13B_train/16384_flash-attn_ckpt_False.py
new file mode 100644
index 00000000..393e54d3
--- /dev/null
+++ b/configs/13B_train/16384_flash-attn_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
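+    # [Editor's note] Across this generated sweep the tensor["sp"] field takes the values
+    # "none", "megatron", "flash", "flash-attn" and "intern"; intern_overlap is set to
+    # True only in the sp="intern" configs.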
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_flash-attn_ckpt_True.py b/configs/13B_train/16384_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..7f7e7ac6
--- /dev/null
+++ b/configs/13B_train/16384_flash-attn_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
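+    # [Editor's note] In this variant model["checkpoint"] is True below, i.e. activations
+    # are recomputed for all NUM_LAYER = 40 layers during the backward pass, trading extra
+    # compute for lower activation memory at SEQ_LEN = 16384.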
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_flash_ckpt_False.py b/configs/13B_train/16384_flash_ckpt_False.py
new file mode 100644
index 00000000..cadd215f
--- /dev/null
+++ b/configs/13B_train/16384_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
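+    # [Editor's note] Editor's arithmetic for the scheduler below, assuming warmup steps
+    # are floored: warmup_ratio * total_steps = 0.01 * 20 = 0.2, so this short benchmark
+    # run effectively performs no learning rate warmup.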
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_flash_ckpt_True.py b/configs/13B_train/16384_flash_ckpt_True.py
new file mode 100644
index 00000000..c60ea730
--- /dev/null
+++ b/configs/13B_train/16384_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
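+    # [Editor's note] Per the zero1 docstring below, size=-1 (i.e. <= 0) partitions the
+    # optimizer states across the entire data parallel group rather than a smaller subgroup.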
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallelism, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_intern_ckpt_False.py b/configs/13B_train/16384_intern_ckpt_False.py
new file mode 100644
index 00000000..e5d6fa6b
--- /dev/null
+++ b/configs/13B_train/16384_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
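+    # (note: since enable_save_ckpt=False above, these save/snapshot settings should have no effect for this run)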
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_intern_ckpt_True.py b/configs/13B_train/16384_intern_ckpt_True.py new file mode 100644 index 00000000..6ac47ac2 --- /dev/null +++ b/configs/13B_train/16384_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
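+    # (note: async_upload only applies to "boto3:" paths, so it should be a no-op with the "local:" save folder used here)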
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_megatron_ckpt_False.py b/configs/13B_train/16384_megatron_ckpt_False.py new file mode 100644 index 00000000..24429ead --- /dev/null +++ b/configs/13B_train/16384_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
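+    # (note: CHECKPOINT_EVERY = 50 exceeds total_steps = 20 below, so no periodic ckpt would ever trigger in this short run)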
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_megatron_ckpt_True.py b/configs/13B_train/16384_megatron_ckpt_True.py new file mode 100644 index 00000000..d79c8207 --- /dev/null +++ b/configs/13B_train/16384_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_none_ckpt_False.py b/configs/13B_train/16384_none_ckpt_False.py new file mode 100644 index 00000000..a30d713a --- /dev/null +++ b/configs/13B_train/16384_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_none_ckpt_True.py b/configs/13B_train/16384_none_ckpt_True.py new file mode 100644 index 00000000..76483257 --- /dev/null +++ b/configs/13B_train/16384_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_flash-attn_ckpt_False.py b/configs/13B_train/262144_flash-attn_ckpt_False.py new file mode 100644 index 00000000..fd0be6a7 --- /dev/null +++ b/configs/13B_train/262144_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_flash_ckpt_False.py b/configs/13B_train/262144_flash_ckpt_False.py new file mode 100644 index 00000000..5ca332ef --- /dev/null +++ b/configs/13B_train/262144_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_flash_ckpt_True.py b/configs/13B_train/262144_flash_ckpt_True.py new file mode 100644 index 00000000..f990655a --- /dev/null +++ b/configs/13B_train/262144_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/262144_intern_ckpt_False.py b/configs/13B_train/262144_intern_ckpt_False.py
new file mode 100644
index 00000000..7ebcf94f
--- /dev/null
+++ b/configs/13B_train/262144_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the format of the checkpoint to be loaded, e.g. the 'internlm' format used here.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
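+
+# Note: enable_save_ckpt is False and total_steps = 20 (set in `data` below) is
+# smaller than CHECKPOINT_EVERY = 50, so no checkpoint would be written in this
+# run either way; these short-run configs look benchmark-oriented.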
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num is the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable communication overlap in the low-level optimizer
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode; the configs in this directory use "none", "megatron",
+        "flash"/"flash-attn" and "intern".
+    3. intern_overlap: bool, enable/disable communication/computation overlap, effective for the "intern" mode.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_intern_ckpt_True.py b/configs/13B_train/262144_intern_ckpt_True.py new file mode 100644 index 00000000..e958ac06 --- /dev/null +++ b/configs/13B_train/262144_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
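+    # Note: "/dev/shm" above is a RAM-backed tmpfs on Linux, so files staged
+    # there for asynchronous upload consume host memory and vanish on reboot.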
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_megatron_ckpt_False.py b/configs/13B_train/262144_megatron_ckpt_False.py new file mode 100644 index 00000000..31e96f78 --- /dev/null +++ b/configs/13B_train/262144_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
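+    # For reference, per the guide above: content=("all",) would restore model,
+    # sampler, optimizer and scheduler state together instead of weights only.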
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_megatron_ckpt_True.py b/configs/13B_train/262144_megatron_ckpt_True.py new file mode 100644 index 00000000..2339244b --- /dev/null +++ b/configs/13B_train/262144_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_none_ckpt_False.py b/configs/13B_train/262144_none_ckpt_False.py new file mode 100644 index 00000000..41d55e91 --- /dev/null +++ b/configs/13B_train/262144_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
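+    # Example (per the guide above): to initialize from MODEL_ONLY_FOLDER rather
+    # than auto-resuming, set auto_resume=False so load_ckpt_info takes effect.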
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_none_ckpt_True.py b/configs/13B_train/262144_none_ckpt_True.py new file mode 100644 index 00000000..4f2da605 --- /dev/null +++ b/configs/13B_train/262144_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_flash-attn_ckpt_False.py b/configs/13B_train/32768_flash-attn_ckpt_False.py new file mode 100644 index 00000000..3eb0f493 --- /dev/null +++ b/configs/13B_train/32768_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
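+    # With CHECKPOINT_EVERY = 50, oss_snapshot_freq evaluates to int(50 / 2) = 25,
+    # i.e. one snapshot every 25 steps, twice per full checkpoint interval.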
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_flash-attn_ckpt_True.py b/configs/13B_train/32768_flash-attn_ckpt_True.py new file mode 100644 index 00000000..26b06ef3 --- /dev/null +++ b/configs/13B_train/32768_flash-attn_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_flash_ckpt_False.py b/configs/13B_train/32768_flash_ckpt_False.py new file mode 100644 index 00000000..da30a4dd --- /dev/null +++ b/configs/13B_train/32768_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
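+    # Since enable_save_ckpt=False, the save-related settings above appear inert;
+    # set it to True to activate periodic saving every CHECKPOINT_EVERY steps.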
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_flash_ckpt_True.py b/configs/13B_train/32768_flash_ckpt_True.py new file mode 100644 index 00000000..20d415a5 --- /dev/null +++ b/configs/13B_train/32768_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
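+ # Note: with enable_save_ckpt=False above, the save/upload settings in this dict
+ # are effectively inert; no checkpoints are written during these short runs.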
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_intern_ckpt_False.py b/configs/13B_train/32768_intern_ckpt_False.py new file mode 100644 index 00000000..05ab5285 --- /dev/null +++ b/configs/13B_train/32768_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
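+ # /dev/shm is a RAM-backed tmpfs, so staging async uploads there avoids disk
+ # I/O at the cost of host memory.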
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_intern_ckpt_True.py b/configs/13B_train/32768_intern_ckpt_True.py new file mode 100644 index 00000000..273a812d --- /dev/null +++ b/configs/13B_train/32768_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
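+ # content=("model",) restores weights only; optimizer, sampler and scheduler
+ # state are not loaded (see the 'content' guide above).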
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_megatron_ckpt_False.py b/configs/13B_train/32768_megatron_ckpt_False.py new file mode 100644 index 00000000..c8db542d --- /dev/null +++ b/configs/13B_train/32768_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
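+ # CHECKPOINT_EVERY (50) exceeds total_steps (20) in the data dict below, so a
+ # full checkpoint save would never trigger here even if saving were enabled.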
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_megatron_ckpt_True.py b/configs/13B_train/32768_megatron_ckpt_True.py new file mode 100644 index 00000000..9ff56012 --- /dev/null +++ b/configs/13B_train/32768_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
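+ # MODEL_ONLY_FOLDER ("local:llm_ckpts/xxxx") is a placeholder; point it at a
+ # real checkpoint directory before relying on load_ckpt_info.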
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_none_ckpt_False.py b/configs/13B_train/32768_none_ckpt_False.py new file mode 100644 index 00000000..a02e0711 --- /dev/null +++ b/configs/13B_train/32768_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
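+ # Path prefixes select the storage backend: "local:" for a filesystem path,
+ # "boto3:" for object storage (see the folder-format comments above).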
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_none_ckpt_True.py b/configs/13B_train/32768_none_ckpt_True.py new file mode 100644 index 00000000..b9b17e3c --- /dev/null +++ b/configs/13B_train/32768_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
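+ # Both load_ckpt_folder and load_ckpt_info are set; load_ckpt_info is the form
+ # documented above, and the bare load_ckpt_folder string appears to be a legacy field.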
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_flash-attn_ckpt_False.py b/configs/13B_train/4096_flash-attn_ckpt_False.py new file mode 100644 index 00000000..8e4459ea --- /dev/null +++ b/configs/13B_train/4096_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
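+ # async_upload only takes effect for boto3 paths; with the local
+ # SAVE_CKPT_FOLDER above it is effectively a no-op.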
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_flash-attn_ckpt_True.py b/configs/13B_train/4096_flash-attn_ckpt_True.py new file mode 100644 index 00000000..a8f5e39b --- /dev/null +++ b/configs/13B_train/4096_flash-attn_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
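+ # Reminder: while auto_resume=True, the latest checkpoint under save_ckpt_folder
+ # takes precedence over the path given in load_ckpt_info.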
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_flash_ckpt_False.py b/configs/13B_train/4096_flash_ckpt_False.py new file mode 100644 index 00000000..517b46e4 --- /dev/null +++ b/configs/13B_train/4096_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
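+    # Hypothetical illustration (values not part of this run): to resume every state from step 50,
+    # load_ckpt_info could be set to dict(path=SAVE_CKPT_FOLDER + "/50", content=("all",), ckpt_type="internlm").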
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_flash_ckpt_True.py b/configs/13B_train/4096_flash_ckpt_True.py new file mode 100644 index 00000000..eacfcdfd --- /dev/null +++ b/configs/13B_train/4096_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_intern_ckpt_False.py b/configs/13B_train/4096_intern_ckpt_False.py new file mode 100644 index 00000000..5ecf2d66 --- /dev/null +++ b/configs/13B_train/4096_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_intern_ckpt_True.py b/configs/13B_train/4096_intern_ckpt_True.py new file mode 100644 index 00000000..b70acb01 --- /dev/null +++ b/configs/13B_train/4096_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_megatron_ckpt_False.py b/configs/13B_train/4096_megatron_ckpt_False.py new file mode 100644 index 00000000..2e847a64 --- /dev/null +++ b/configs/13B_train/4096_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_megatron_ckpt_True.py b/configs/13B_train/4096_megatron_ckpt_True.py new file mode 100644 index 00000000..d8ba2c57 --- /dev/null +++ b/configs/13B_train/4096_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_none_ckpt_False.py b/configs/13B_train/4096_none_ckpt_False.py new file mode 100644 index 00000000..f8bbdfc5 --- /dev/null +++ b/configs/13B_train/4096_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_none_ckpt_True.py b/configs/13B_train/4096_none_ckpt_True.py new file mode 100644 index 00000000..d8f8ec7e --- /dev/null +++ b/configs/13B_train/4096_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_flash-attn_ckpt_False.py b/configs/13B_train/65536_flash-attn_ckpt_False.py new file mode 100644 index 00000000..09367f5a --- /dev/null +++ b/configs/13B_train/65536_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: sequence parallelism is configured through the tensor parallel `sp` mode above rather
+        than a standalone `sequence_parallel` flag; sp="none" disables it.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/65536_flash-attn_ckpt_True.py b/configs/13B_train/65536_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..dc283a92
--- /dev/null
+++ b/configs/13B_train/65536_flash-attn_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
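+    # Illustrative note: /dev/shm is typically a RAM-backed tmpfs, so staging temporary upload
+    # files there presumably avoids contending with training I/O on disk; adjust if unavailable.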
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of "none", "megatron", "flash", "flash-attn" or "intern";
+        "none" disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable communication-computation overlap for the "intern" sp mode,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_flash_ckpt_False.py b/configs/13B_train/65536_flash_ckpt_False.py new file mode 100644 index 00000000..482d5114 --- /dev/null +++ b/configs/13B_train/65536_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_flash_ckpt_True.py b/configs/13B_train/65536_flash_ckpt_True.py new file mode 100644 index 00000000..66051f83 --- /dev/null +++ b/configs/13B_train/65536_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_intern_ckpt_False.py b/configs/13B_train/65536_intern_ckpt_False.py new file mode 100644 index 00000000..f829652a --- /dev/null +++ b/configs/13B_train/65536_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_intern_ckpt_True.py b/configs/13B_train/65536_intern_ckpt_True.py new file mode 100644 index 00000000..4e94d0e3 --- /dev/null +++ b/configs/13B_train/65536_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_megatron_ckpt_False.py b/configs/13B_train/65536_megatron_ckpt_False.py new file mode 100644 index 00000000..a9293334 --- /dev/null +++ b/configs/13B_train/65536_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_megatron_ckpt_True.py b/configs/13B_train/65536_megatron_ckpt_True.py new file mode 100644 index 00000000..845e32bc --- /dev/null +++ b/configs/13B_train/65536_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_none_ckpt_False.py b/configs/13B_train/65536_none_ckpt_False.py new file mode 100644 index 00000000..52ce3c52 --- /dev/null +++ b/configs/13B_train/65536_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_none_ckpt_True.py b/configs/13B_train/65536_none_ckpt_True.py new file mode 100644 index 00000000..de5532e1 --- /dev/null +++ b/configs/13B_train/65536_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
diff --git a/configs/13B_train/8192_flash-attn_ckpt_False.py b/configs/13B_train/8192_flash-attn_ckpt_False.py
new file mode 100644
index 00000000..3324c290
--- /dev/null
+++ b/configs/13B_train/8192_flash-attn_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
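Editorial note: for reference, the batch arithmetic implied by the `data` dict above. With micro_bsz=2 and SEQ_LEN=8192, packed_length is 16384 tokens, and micro_num=4 micro-batches give 65536 tokens per rank per gradient update; the global figure multiplies by the data-parallel size (assumed here to be 4):

    SEQ_LEN, micro_bsz, micro_num = 8192, 2, 4
    packed_length = micro_bsz * SEQ_LEN               # 16384
    tokens_per_rank_step = micro_num * packed_length  # 65536
    dp_size = 4                                       # hypothetical world_size // (tensor * pipeline)
    print(packed_length, tokens_per_rank_step, tokens_per_rank_step * dp_size)  # 16384 65536 262144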
diff --git a/configs/13B_train/8192_flash-attn_ckpt_True.py b/configs/13B_train/8192_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..317e0f32
--- /dev/null
+++ b/configs/13B_train/8192_flash-attn_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
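Editorial note: the `grad_scaler` fields describe a standard dynamic loss-scaling policy. A self-contained sketch of that policy under the semantics stated in the comments (this class is illustrative, not InternLM's implementation):

    class DynamicScaler:
        def __init__(self, initial_scale=2**16, growth_factor=2, backoff_factor=0.5,
                     growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
            self.scale = initial_scale
            self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
            self.growth_interval, self.hysteresis = growth_interval, hysteresis
            self.min_scale, self.max_scale = min_scale, max_scale
            self._good_steps, self._overflows = 0, 0

        def update(self, found_overflow: bool):
            if found_overflow:
                self._good_steps = 0
                self._overflows += 1
                if self._overflows >= self.hysteresis:  # tolerate `hysteresis` overflows before backing off
                    self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                    self._overflows = 0
            else:
                self._good_steps += 1
                if self._good_steps >= self.growth_interval:  # grow after a streak of clean steps
                    self.scale = min(self.scale * self.growth_factor, self.max_scale)
                    self._good_steps = 0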
diff --git a/configs/13B_train/8192_flash_ckpt_False.py b/configs/13B_train/8192_flash_ckpt_False.py
new file mode 100644
index 00000000..d645dc1b
--- /dev/null
+++ b/configs/13B_train/8192_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
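Editorial note: the `lr_scheduler` dict above implies warmup_ratio * total_steps warmup steps before decaying from `adam.lr` toward eta_min. A sketch assuming linear warmup and cosine decay (the exact curve used by the repo's scheduler is not shown in this patch):

    import math

    def lr_at(step, total_steps=20, warmup_ratio=0.01, base_lr=1e-4, eta_min=1e-5):
        warmup_steps = max(1, int(warmup_ratio * total_steps))  # at least one warmup step
        if step < warmup_steps:
            return base_lr * (step + 1) / warmup_steps          # linear warmup (assumed shape)
        # cosine decay to eta_min (assumed shape)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * progress))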
diff --git a/configs/13B_train/8192_flash_ckpt_True.py b/configs/13B_train/8192_flash_ckpt_True.py
new file mode 100644
index 00000000..425859c0
--- /dev/null
+++ b/configs/13B_train/8192_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
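Editorial note: `model.checkpoint` accepts True/False or a float in [0, 1], read per its comment as the proportion of layers whose activations are recomputed in backward. A hypothetical selector under that reading:

    import math

    def layers_to_checkpoint(checkpoint, num_layers=40):
        ratio = 1.0 if checkpoint is True else (0.0 if checkpoint is False else float(checkpoint))
        n = math.ceil(ratio * num_layers)
        return list(range(n))  # checkpoint the first n transformer layers

    print(len(layers_to_checkpoint(True)))  # 40
    print(len(layers_to_checkpoint(0.25)))  # 10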
diff --git a/configs/13B_train/8192_intern_ckpt_False.py b/configs/13B_train/8192_intern_ckpt_False.py
new file mode 100644
index 00000000..0b4fb8a2
--- /dev/null
+++ b/configs/13B_train/8192_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
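Editorial note: only the sp="intern" configs set intern_overlap=True. The general idea, sketched below with plain torch.distributed calls rather than this patch's kernels, is to prefetch the next layer's sharded weights with an asynchronous all-gather while the current layer computes:

    import torch
    import torch.distributed as dist
    import torch.nn.functional as F

    def forward_with_prefetch(shards, x, group=None):
        """shards[i] is this rank's slice of layer i's weight (hypothetical layout)."""
        world = dist.get_world_size(group)
        bufs = [torch.empty_like(shards[0]) for _ in range(world)]
        work = dist.all_gather(bufs, shards[0], group=group, async_op=True)
        for i in range(len(shards)):
            work.wait()                          # full weight for layer i is ready
            weight = torch.cat(bufs, dim=0)
            if i + 1 < len(shards):              # launch the gather for layer i+1 now,
                bufs = [torch.empty_like(shards[i + 1]) for _ in range(world)]
                work = dist.all_gather(bufs, shards[i + 1], group=group, async_op=True)
            x = F.linear(x, weight)              # so it overlaps with this compute
        return x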
diff --git a/configs/13B_train/8192_intern_ckpt_True.py b/configs/13B_train/8192_intern_ckpt_True.py
new file mode 100644
index 00000000..b42cb769
--- /dev/null
+++ b/configs/13B_train/8192_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
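Editorial note: the long comment block in `ckpt` describes the auto_resume rule. A compact restatement as a hypothetical helper (the step-numbered subdirectory layout, e.g. "49", is assumed from LOAD_CKPT_FOLDER above):

    import os

    def resolve_ckpt(save_folder, load_ckpt_info, auto_resume=True):
        if auto_resume:
            steps = []
            if os.path.isdir(save_folder):
                steps = sorted(int(d) for d in os.listdir(save_folder) if d.isdigit())
            # auto_resume only ever looks at save_ckpt_folder, never load_ckpt_info
            return os.path.join(save_folder, str(steps[-1])) if steps else None
        # auto_resume=False: honor load_ckpt_info, or train from scratch when it is None
        return None if load_ckpt_info is None else load_ckpt_info["path"]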
diff --git a/configs/13B_train/8192_megatron_ckpt_False.py b/configs/13B_train/8192_megatron_ckpt_False.py
new file mode 100644
index 00000000..e2191937
--- /dev/null
+++ b/configs/13B_train/8192_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
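Editorial note: the configs/13B_train files differ only in sequence length, sp mode, and the activation-checkpointing flag, so the grid could be produced mechanically. A sketch of the naming scheme as inferred from the filenames in this patch:

    from itertools import product

    for seq_len, sp, use_ckpt in product((8192, 65536),
                                         ("none", "megatron", "flash-attn", "flash", "intern"),
                                         (False, True)):
        name = f"{seq_len}_{sp}_ckpt_{use_ckpt}.py"
        tensor = dict(size=8, sp=sp, intern_overlap=(sp == "intern"))  # overlap only for 'intern'
        print(name, tensor)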
diff --git a/configs/13B_train/8192_megatron_ckpt_True.py b/configs/13B_train/8192_megatron_ckpt_True.py
new file mode 100644
index 00000000..5123c412
--- /dev/null
+++ b/configs/13B_train/8192_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
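Editorial note: `loss.label_smoothing=0` keeps plain cross-entropy; a nonzero epsilon would mix the one-hot target with a uniform distribution over the vocabulary. A reference formulation (the standard definition, not this repo's kernel):

    import torch.nn.functional as F

    def smoothed_cross_entropy(logits, target, eps=0.0):
        logp = F.log_softmax(logits, dim=-1)
        nll = -logp.gather(-1, target.unsqueeze(-1)).squeeze(-1)  # eps = 0 reduces to this term
        return ((1 - eps) * nll - eps * logp.mean(dim=-1)).mean()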
diff --git a/configs/13B_train/8192_none_ckpt_False.py b/configs/13B_train/8192_none_ckpt_False.py
new file mode 100644
index 00000000..c9d9c050
--- /dev/null
+++ b/configs/13B_train/8192_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/8192_none_ckpt_True.py b/configs/13B_train/8192_none_ckpt_True.py new file mode 100644 index 00000000..182ec21f --- /dev/null +++ b/configs/13B_train/8192_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_template.py b/configs/30B_template.py
new file mode 100644
index 00000000..7a32015e
--- /dev/null
+++ b/configs/30B_template.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = {seq_len}
+JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint})
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' specifies which states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
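+    # NOTE (assumption): the {seq_len}, {sp}, {checkpoint} and {intern_overlap}
+    # fields in this file are str.format-style placeholders, presumably rendered
+    # into the concrete configs under configs/*_train/ by a sweep script. A
+    # minimal hypothetical sketch (script and paths assumed, not part of this
+    # patch):
+    #     cfg = open("configs/30B_template.py").read()
+    #     for key, val in {"{seq_len}": "131072", "{sp}": '"flash"',
+    #                      "{checkpoint}": "False", "{intern_overlap}": "False"}.items():
+    #         cfg = cfg.replace(key, val)  # plain replace; .format() would trip on
+    #                                      # unrelated braces such as {BOTO3_IP}
+    #     open("configs/30B_train/131072_flash_ckpt_False.py", "w").write(cfg)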
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable communication overlap in the low-level optimizer
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint={checkpoint},  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern', matching the
+        `sp` field set below; modes other than 'none' imply sequence parallelism.
+    3. intern_overlap: bool, enable/disable communication-computation overlap; in these configs it is only
+        enabled together with sp='intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_flash_ckpt_False.py b/configs/30B_train/131072_flash_ckpt_False.py new file mode 100644 index 00000000..3af48f3e --- /dev/null +++ b/configs/30B_train/131072_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
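+    # NOTE: the file name encodes the sweep axes as <seq_len>_<sp>_ckpt_<bool>.py,
+    # mirroring SEQ_LEN, the tensor parallel `sp` mode and `model.checkpoint`
+    # inside each generated config.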
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_flash_ckpt_True.py b/configs/30B_train/131072_flash_ckpt_True.py new file mode 100644 index 00000000..4bd249bc --- /dev/null +++ b/configs/30B_train/131072_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_intern_ckpt_False.py b/configs/30B_train/131072_intern_ckpt_False.py new file mode 100644 index 00000000..77b176d2 --- /dev/null +++ b/configs/30B_train/131072_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
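+    # NOTE: across these sweep configs, sp="intern" is the only mode paired with
+    # intern_overlap=True (see the parallel dict at the bottom of this file); the
+    # flag appears to gate communication/computation overlap for that mode and
+    # stays False for the "none"/"megatron"/"flash" variants.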
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_intern_ckpt_True.py b/configs/30B_train/131072_intern_ckpt_True.py new file mode 100644 index 00000000..38a1db3b --- /dev/null +++ b/configs/30B_train/131072_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_megatron_ckpt_False.py b/configs/30B_train/131072_megatron_ckpt_False.py new file mode 100644 index 00000000..49879303 --- /dev/null +++ b/configs/30B_train/131072_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
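+    # NOTE: this variant keeps model.checkpoint=False at SEQ_LEN = 131072; its
+    # companion 131072_megatron_ckpt_True.py sets it to True, trading recompute
+    # time for activation memory.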
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_megatron_ckpt_True.py b/configs/30B_train/131072_megatron_ckpt_True.py new file mode 100644 index 00000000..d911d381 --- /dev/null +++ b/configs/30B_train/131072_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_none_ckpt_False.py b/configs/30B_train/131072_none_ckpt_False.py new file mode 100644 index 00000000..78b3c9a8 --- /dev/null +++ b/configs/30B_train/131072_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
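+    # NOTE: sp="none" below appears to be the baseline setting: tensor parallel
+    # of size 8 without a sequence-parallel optimization, against which the
+    # "megatron", "flash" and "intern" variants are swept.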
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/131072_none_ckpt_True.py b/configs/30B_train/131072_none_ckpt_True.py
new file mode 100644
index 00000000..941279e7
--- /dev/null
+++ b/configs/30B_train/131072_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
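+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).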
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_flash_ckpt_False.py b/configs/30B_train/16384_flash_ckpt_False.py
new file mode 100644
index 00000000..779a10bc
--- /dev/null
+++ b/configs/30B_train/16384_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
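+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).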
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_flash_ckpt_True.py b/configs/30B_train/16384_flash_ckpt_True.py
new file mode 100644
index 00000000..0498e2c4
--- /dev/null
+++ b/configs/30B_train/16384_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
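+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).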
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_intern_ckpt_False.py b/configs/30B_train/16384_intern_ckpt_False.py
new file mode 100644
index 00000000..309a33f0
--- /dev/null
+++ b/configs/30B_train/16384_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
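+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).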
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_intern_ckpt_True.py b/configs/30B_train/16384_intern_ckpt_True.py
new file mode 100644
index 00000000..23c977a5
--- /dev/null
+++ b/configs/30B_train/16384_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
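+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).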
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_megatron_ckpt_False.py b/configs/30B_train/16384_megatron_ckpt_False.py
new file mode 100644
index 00000000..8576aa76
--- /dev/null
+++ b/configs/30B_train/16384_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
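+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).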
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_megatron_ckpt_True.py b/configs/30B_train/16384_megatron_ckpt_True.py
new file mode 100644
index 00000000..460aba3b
--- /dev/null
+++ b/configs/30B_train/16384_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
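+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).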
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_none_ckpt_False.py b/configs/30B_train/16384_none_ckpt_False.py
new file mode 100644
index 00000000..4ca50666
--- /dev/null
+++ b/configs/30B_train/16384_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
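+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).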
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_none_ckpt_True.py b/configs/30B_train/16384_none_ckpt_True.py
new file mode 100644
index 00000000..c7987e0d
--- /dev/null
+++ b/configs/30B_train/16384_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
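+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).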
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/262144_flash_ckpt_False.py b/configs/30B_train/262144_flash_ckpt_False.py new file mode 100644 index 00000000..10d71d9c --- /dev/null +++ b/configs/30B_train/262144_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
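+    # A sketch of the common loading setups implied by the guide comments above
+    # (illustrative combinations, not additional config keys):
+    #   resume interrupted training: auto_resume=True (latest ckpt under save_ckpt_folder)
+    #   fine-tune from a model:      auto_resume=False, load_ckpt_info=dict(
+    #                                    path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm")
+    #   train from scratch:          auto_resume=False, load_ckpt_info=None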
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_flash_ckpt_True.py b/configs/30B_train/262144_flash_ckpt_True.py
new file mode 100644
index 00000000..a1990dbb
--- /dev/null
+++ b/configs/30B_train/262144_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
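+    # A rough throughput figure from the data section below, assuming full sample packing:
+    # packed_length = micro_bsz * SEQ_LEN = 2 * 262144 = 524288 tokens per micro batch, so
+    # one optimizer step consumes micro_num * packed_length = 4 * 524288 = 2097152 tokens
+    # per data-parallel rank.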
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_intern_ckpt_False.py b/configs/30B_train/262144_intern_ckpt_False.py
new file mode 100644
index 00000000..f8ec6a2f
--- /dev/null
+++ b/configs/30B_train/262144_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
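+    # Note on this variant: the parallel section at the end of this file pairs sp="intern"
+    # with intern_overlap=True, i.e. the optimized sequence-parallel path this patch adds,
+    # which presumably overlaps the weight all-gather/reduce-scatter with computation; the
+    # "none"/"megatron"/"flash" variants in this folder serve as baselines.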
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_intern_ckpt_True.py b/configs/30B_train/262144_intern_ckpt_True.py
new file mode 100644
index 00000000..c5afa46b
--- /dev/null
+++ b/configs/30B_train/262144_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
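+    # Note on this variant: model["checkpoint"]=True below recomputes activations in the
+    # backward pass instead of storing them, which is presumably what lets the 262144-token
+    # sequence length fit in memory; the _ckpt_False twin of this file is identical except
+    # for that flag.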
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_megatron_ckpt_False.py b/configs/30B_train/262144_megatron_ckpt_False.py
new file mode 100644
index 00000000..412da179
--- /dev/null
+++ b/configs/30B_train/262144_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_megatron_ckpt_True.py b/configs/30B_train/262144_megatron_ckpt_True.py
new file mode 100644
index 00000000..79affb19
--- /dev/null
+++ b/configs/30B_train/262144_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
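+    # A rough parameter count for the model defined below, assuming a LLaMA-style gated MLP:
+    # per layer ~ 4*H^2 (attention) + 3*H*(8/3*H) (MLP) = 12*H^2 = 12 * 6144^2 ~ 0.45B,
+    # times NUM_LAYER = 60 gives ~ 27.2B, plus ~ 0.63B of embedding weights (103168 * 6144),
+    # consistent with the 30B_train folder name.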
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_none_ckpt_False.py b/configs/30B_train/262144_none_ckpt_False.py
new file mode 100644
index 00000000..e6fbe1eb
--- /dev/null
+++ b/configs/30B_train/262144_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_none_ckpt_True.py b/configs/30B_train/262144_none_ckpt_True.py
new file mode 100644
index 00000000..d507c30b
--- /dev/null
+++ b/configs/30B_train/262144_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
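+    # Worked numbers for the lr_scheduler below, assuming warmup_ratio is interpreted as a
+    # fraction of total_steps: warmup lasts int(0.01 * 20) = 0 steps at these benchmark
+    # settings, after which the lr presumably decays from adam["lr"]=1e-4 toward eta_min=1e-5.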
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_flash_ckpt_False.py b/configs/30B_train/32768_flash_ckpt_False.py
new file mode 100644
index 00000000..6bac5b31
--- /dev/null
+++ b/configs/30B_train/32768_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_flash_ckpt_True.py b/configs/30B_train/32768_flash_ckpt_True.py
new file mode 100644
index 00000000..f21c9983
--- /dev/null
+++ b/configs/30B_train/32768_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
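The checkpoint cadence above couples oss_snapshot_freq to CHECKPOINT_EVERY; a small sketch of the implied schedule (assuming both fire on exact step multiples, which the config itself does not pin down):

CHECKPOINT_EVERY = 50
oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)  # snapshots twice as often as full ckpts
full_ckpts = [s for s in range(1, 201) if s % CHECKPOINT_EVERY == 0]
snapshots = [s for s in range(1, 201) if s % oss_snapshot_freq == 0]
assert full_ckpts == [50, 100, 150, 200] and len(snapshots) == 8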
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
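zero1=-1 above defers to the data-parallel group, whose size falls out of the tensor and pipeline splits; a minimal sketch of that arithmetic (world_size is an assumed example value):

world_size = 32                  # assumed: 4 nodes x 8 GPUs
tensor_size, pipeline_size = 8, 1
dp_size = world_size // (tensor_size * pipeline_size)  # -> 4
zero1_size = -1                  # from zero1=dict(size=-1, ...) above
if zero1_size <= 0:
    zero1_size = dp_size         # shard optimizer states across the whole dp group
assert 1 <= zero1_size <= dp_size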
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_intern_ckpt_False.py b/configs/30B_train/32768_intern_ckpt_False.py
new file mode 100644
index 00000000..79728d64
--- /dev/null
+++ b/configs/30B_train/32768_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
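sp="intern" with intern_overlap=True prefetches sharded weights with asynchronous all-gathers so the communication hides behind computation. A simplified sketch of the pattern (torch.distributed; assumes an initialized process group and row-sharded weights; not the patch's actual implementation):

import torch
import torch.distributed as dist
import torch.nn.functional as F

def overlapped_linear(x, weight_shard):
    # Each rank holds out_features // world_size rows of the full weight.
    world = dist.get_world_size()
    full_weight = torch.empty(weight_shard.shape[0] * world, weight_shard.shape[1],
                              dtype=weight_shard.dtype, device=weight_shard.device)
    handle = dist.all_gather_into_tensor(full_weight, weight_shard, async_op=True)
    # ...independent work can run here, overlapping the all-gather...
    handle.wait()
    return F.linear(x, full_weight)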
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_intern_ckpt_True.py b/configs/30B_train/32768_intern_ckpt_True.py
new file mode 100644
index 00000000..6dc24c30
--- /dev/null
+++ b/configs/30B_train/32768_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
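Because these generated configs vary only in the sp mode and flags, a small validation helper catches inconsistent combinations early; a sketch (the rules are inferred from the configs in this patch, not from library code):

VALID_SP_MODES = {"none", "megatron", "flash", "intern"}

def check_parallel(parallel):
    tensor = parallel["tensor"]
    assert tensor["sp"] in VALID_SP_MODES, f"unknown sp mode: {tensor['sp']}"
    if tensor.get("intern_overlap", False):
        # overlap only applies to the 'intern' sequence-parallel path
        assert tensor["sp"] == "intern", "intern_overlap=True requires sp='intern'"

check_parallel(dict(tensor=dict(size=8, sp="intern", intern_overlap=True)))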
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_megatron_ckpt_False.py b/configs/30B_train/32768_megatron_ckpt_False.py
new file mode 100644
index 00000000..37fd0986
--- /dev/null
+++ b/configs/30B_train/32768_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
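The grad_scaler dict above encodes a conventional dynamic loss-scaling policy; a compact simulation of the grow/backoff rules (values mirror the config, but the exact hysteresis bookkeeping is an assumption about the usual scheme, not InternLM's code):

scale = float(2**16)                     # initial_scale
good_steps = overflow_count = 0

def update_scale(overflow):
    global scale, good_steps, overflow_count
    if overflow:
        good_steps = 0
        overflow_count += 1
        if overflow_count >= 2:          # hysteresis: tolerate one isolated overflow
            scale = max(scale * 0.5, 1)  # backoff_factor, floored at min_scale
            overflow_count = 0
    else:
        good_steps += 1
        if good_steps >= 1000:           # growth_interval of stable steps
            scale = min(scale * 2, 2**24)  # growth_factor, capped at max_scale
            good_steps = 0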
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_megatron_ckpt_True.py b/configs/30B_train/32768_megatron_ckpt_True.py
new file mode 100644
index 00000000..986b27dd
--- /dev/null
+++ b/configs/30B_train/32768_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
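model.checkpoint above takes True/False or a float in [0, 1]; reading a float as the fraction of transformer layers to recompute gives, for example (sketch; the rounding rule is an assumption):

NUM_LAYER = 60

def checkpointed_layers(checkpoint):
    # True/False, or the fraction of layers run with activation checkpointing
    if isinstance(checkpoint, bool):
        return NUM_LAYER if checkpoint else 0
    return int(NUM_LAYER * checkpoint)

assert checkpointed_layers(True) == 60
assert checkpointed_layers(0.5) == 30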
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_none_ckpt_False.py b/configs/30B_train/32768_none_ckpt_False.py
new file mode 100644
index 00000000..9c6ca879
--- /dev/null
+++ b/configs/30B_train/32768_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
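The lr_scheduler above expresses warmup as a ratio of total_steps, so for this short 20-step smoke-test run the warmup phase rounds away entirely:

total_steps = 20
warmup_ratio = 0.01
warmup_steps = int(total_steps * warmup_ratio)  # int(0.2) == 0: no warmup at 20 steps
assert warmup_steps == 0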
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_none_ckpt_True.py b/configs/30B_train/32768_none_ckpt_True.py
new file mode 100644
index 00000000..d4ab7f2d
--- /dev/null
+++ b/configs/30B_train/32768_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
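MLP_RATIO = 8 / 3 sizes the gated-MLP hidden dimension relative to HIDDEN_SIZE; a common convention (assumed here, not stated in the patch) is to round the product up to a hardware-friendly multiple of 256:

HIDDEN_SIZE = 6144
MLP_RATIO = 8 / 3
mlp_hidden = int(HIDDEN_SIZE * MLP_RATIO)       # 16384 for this config
mlp_hidden = ((mlp_hidden + 255) // 256) * 256  # already a multiple of 256 here
assert mlp_hidden == 16384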
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/4096_flash_ckpt_False.py b/configs/30B_train/4096_flash_ckpt_False.py
new file mode 100644
index 00000000..3dd8be56
--- /dev/null
+++ b/configs/30B_train/4096_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 4096
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/4096_flash_ckpt_True.py b/configs/30B_train/4096_flash_ckpt_True.py
new file mode 100644
index 00000000..73150acf
--- /dev/null
+++ b/configs/30B_train/4096_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 4096
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
 interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallelism, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/4096_intern_ckpt_False.py b/configs/30B_train/4096_intern_ckpt_False.py
new file mode 100644
index 00000000..cff6c5b6
--- /dev/null
+++ b/configs/30B_train/4096_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 4096
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern',
+       as used by the configs in this directory.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_intern_ckpt_True.py b/configs/30B_train/4096_intern_ckpt_True.py new file mode 100644 index 00000000..1fb64257 --- /dev/null +++ b/configs/30B_train/4096_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_megatron_ckpt_False.py b/configs/30B_train/4096_megatron_ckpt_False.py new file mode 100644 index 00000000..79f718d0 --- /dev/null +++ b/configs/30B_train/4096_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
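+    # Note: async_upload only works for boto3 ckpt (see the comment above), and
+    # SAVE_CKPT_FOLDER in this file is a "local:" path, so the async upload settings are
+    # presumably inert here unless the save folder is switched to a boto3: URI.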
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_megatron_ckpt_True.py b/configs/30B_train/4096_megatron_ckpt_True.py new file mode 100644 index 00000000..502ae7f7 --- /dev/null +++ b/configs/30B_train/4096_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_none_ckpt_False.py b/configs/30B_train/4096_none_ckpt_False.py new file mode 100644 index 00000000..981a0f23 --- /dev/null +++ b/configs/30B_train/4096_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_none_ckpt_True.py b/configs/30B_train/4096_none_ckpt_True.py new file mode 100644 index 00000000..dddea663 --- /dev/null +++ b/configs/30B_train/4096_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_flash_ckpt_False.py b/configs/30B_train/65536_flash_ckpt_False.py new file mode 100644 index 00000000..babebd95 --- /dev/null +++ b/configs/30B_train/65536_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_flash_ckpt_True.py b/configs/30B_train/65536_flash_ckpt_True.py new file mode 100644 index 00000000..064250e7 --- /dev/null +++ b/configs/30B_train/65536_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_intern_ckpt_False.py b/configs/30B_train/65536_intern_ckpt_False.py new file mode 100644 index 00000000..64165f44 --- /dev/null +++ b/configs/30B_train/65536_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/65536_intern_ckpt_True.py b/configs/30B_train/65536_intern_ckpt_True.py
new file mode 100644
index 00000000..78b66213
--- /dev/null
+++ b/configs/30B_train/65536_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supporting: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, e.g. the "internlm" type used here.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
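+    # A sketch of resuming the full training state, built from the 'content' values listed in the
+    # guide above (assumes LOAD_CKPT_FOLDER holds a checkpoint previously written by this framework):
+    # load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("all",), ckpt_type="internlm"),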
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/65536_megatron_ckpt_False.py b/configs/30B_train/65536_megatron_ckpt_False.py
new file mode 100644
index 00000000..e8c09548
--- /dev/null
+++ b/configs/30B_train/65536_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supporting: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, e.g. the "internlm" type used here.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
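+    # A from-scratch run, per the note above, would disable resuming entirely:
+    # auto_resume=False,
+    # load_ckpt_info=None,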
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/65536_megatron_ckpt_True.py b/configs/30B_train/65536_megatron_ckpt_True.py
new file mode 100644
index 00000000..d3b64c41
--- /dev/null
+++ b/configs/30B_train/65536_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supporting: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, e.g. the "internlm" type used here.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
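+    # Note: per the comment above, async_upload only takes effect for boto3-style checkpoint
+    # folders (see the commented boto3 block earlier); with the "local:" folder used here,
+    # checkpoints are written without asynchronous upload.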
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_none_ckpt_False.py b/configs/30B_train/65536_none_ckpt_False.py new file mode 100644 index 00000000..ee4c7fb5 --- /dev/null +++ b/configs/30B_train/65536_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
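+    # Since enable_save_ckpt=False, this benchmark config never writes checkpoints; the
+    # checkpoint_every and oss_snapshot_freq settings in this dict only take effect once
+    # enable_save_ckpt is set to True.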
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_none_ckpt_True.py b/configs/30B_train/65536_none_ckpt_True.py new file mode 100644 index 00000000..2e84144c --- /dev/null +++ b/configs/30B_train/65536_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_flash_ckpt_False.py b/configs/30B_train/8192_flash_ckpt_False.py new file mode 100644 index 00000000..b9eb6e65 --- /dev/null +++ b/configs/30B_train/8192_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
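+    # Note: MODEL_ONLY_FOLDER above is a placeholder ("local:llm_ckpts/xxxx"); it must point
+    # at a real model checkpoint before load_ckpt_info can be used.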
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_flash_ckpt_True.py b/configs/30B_train/8192_flash_ckpt_True.py new file mode 100644 index 00000000..c0dd5175 --- /dev/null +++ b/configs/30B_train/8192_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_intern_ckpt_False.py b/configs/30B_train/8192_intern_ckpt_False.py new file mode 100644 index 00000000..d915b6b8 --- /dev/null +++ b/configs/30B_train/8192_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_intern_ckpt_True.py b/configs/30B_train/8192_intern_ckpt_True.py new file mode 100644 index 00000000..a71693a1 --- /dev/null +++ b/configs/30B_train/8192_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/8192_megatron_ckpt_False.py b/configs/30B_train/8192_megatron_ckpt_False.py
new file mode 100644
index 00000000..dcacb9e5
--- /dev/null
+++ b/configs/30B_train/8192_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_megatron_ckpt_True.py b/configs/30B_train/8192_megatron_ckpt_True.py new file mode 100644 index 00000000..b6e4ba24 --- /dev/null +++ b/configs/30B_train/8192_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_none_ckpt_False.py b/configs/30B_train/8192_none_ckpt_False.py new file mode 100644 index 00000000..ce790dfa --- /dev/null +++ b/configs/30B_train/8192_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_none_ckpt_True.py b/configs/30B_train/8192_none_ckpt_True.py new file mode 100644 index 00000000..e6afcd4e --- /dev/null +++ b/configs/30B_train/8192_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index c51c8129..4f482656 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -49,14 +49,14 @@ data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, + micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=4, + micro_bsz=1, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate valid_every=50, - pack_sample_into_one=False, + pack_sample_into_one=True, total_steps=20, skip_batches="", rampup_batch_size="", @@ -64,7 +64,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + empty_cache_and_diag_interval=100, diag_outlier_ratio=1.1, ) @@ -90,7 +90,7 @@ hybrid_zero_optimizer = dict( # Enable low_level_optimzer overlap_communication overlap_sync_grad=True, - overlap_sync_param=True, + overlap_sync_param=False, # bucket size for nccl communication params reduce_bucket_size=512 * 1024 * 1024, # grad clipping @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), + tensor=dict(size=8, sp="none", intern_overlap=False), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/configs/7B_template.py b/configs/7B_template.py new file mode 100644 index 00000000..b9f76a51 --- /dev/null +++ b/configs/7B_template.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = {seq_len} +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
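+    # e.g. to resume only the model weights from the step-49 folder defined above, one could set
+    # (illustrative values, mirroring the actual line below):
+    #     load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("model",), ckpt_type="internlm")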
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", 
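+    # norm_type="rmsnorm" selects RMSNorm (no mean-centering, scale-only normalization);
+    # layer_norm_epsilon below is the eps added to the variance for numerical stability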
+ layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/131072_flash-attn_ckpt_False.py b/configs/7B_train/131072_flash-attn_ckpt_False.py new file mode 100644 index 00000000..047fb372 --- /dev/null +++ b/configs/7B_train/131072_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
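+    # NOTE: this file appears to be generated from configs/7B_template.py by filling in the
+    # {seq_len}/{sp}/{checkpoint}/{intern_overlap} placeholders; see that template for the full option guide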
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
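+    # use_flash_attn below is what makes SEQ_LEN = 131072 practical here: flash-attn kernels avoid
+    # materializing the full attention matrix, keeping attention memory linear in sequence length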
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/131072_flash-attn_ckpt_True.py b/configs/7B_train/131072_flash-attn_ckpt_True.py new file mode 100644 index 00000000..763627d6 --- /dev/null +++ b/configs/7B_train/131072_flash-attn_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
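+    # checkpoint=True above recomputes all layers' activations during backward, trading extra
+    # compute for a large activation-memory saving at this sequence length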
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/131072_flash_ckpt_False.py b/configs/7B_train/131072_flash_ckpt_False.py new file mode 100644 index 00000000..4307e9d1 --- /dev/null +++ b/configs/7B_train/131072_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
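+    # with bfloat16 (dtype above), dynamic loss scaling is generally unnecessary since bf16 has the
+    # same exponent range as fp32; the grad_scaler settings mainly matter for float16 runs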
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/131072_flash_ckpt_True.py b/configs/7B_train/131072_flash_ckpt_True.py new file mode 100644 index 00000000..c110b256 --- /dev/null +++ b/configs/7B_train/131072_flash_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_intern_ckpt_False.py b/configs/7B_train/131072_intern_ckpt_False.py
new file mode 100644
index 00000000..1d728be7
--- /dev/null
+++ b/configs/7B_train/131072_intern_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_intern_ckpt_True.py b/configs/7B_train/131072_intern_ckpt_True.py
new file mode 100644
index 00000000..45d4aa01
--- /dev/null
+++ b/configs/7B_train/131072_intern_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_megatron_ckpt_False.py b/configs/7B_train/131072_megatron_ckpt_False.py
new file mode 100644
index 00000000..0bd98459
--- /dev/null
+++ b/configs/7B_train/131072_megatron_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_megatron_ckpt_True.py b/configs/7B_train/131072_megatron_ckpt_True.py
new file mode 100644
index 00000000..9200afbe
--- /dev/null
+++ b/configs/7B_train/131072_megatron_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_none_ckpt_False.py b/configs/7B_train/131072_none_ckpt_False.py
new file mode 100644
index 00000000..16059fb1
--- /dev/null
+++ b/configs/7B_train/131072_none_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_none_ckpt_True.py b/configs/7B_train/131072_none_ckpt_True.py
new file mode 100644
index 00000000..35b3f08e
--- /dev/null
+++ b/configs/7B_train/131072_none_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/16384_flash-attn_ckpt_False.py b/configs/7B_train/16384_flash-attn_ckpt_False.py
new file mode 100644
index 00000000..53a64b99
--- /dev/null
+++ b/configs/7B_train/16384_flash-attn_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/16384_flash-attn_ckpt_True.py b/configs/7B_train/16384_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..cdb051e5
--- /dev/null
+++ b/configs/7B_train/16384_flash-attn_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/16384_flash_ckpt_False.py b/configs/7B_train/16384_flash_ckpt_False.py
new file mode 100644
index 00000000..41b39515
--- /dev/null
+++ b/configs/7B_train/16384_flash_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Supported: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
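+    # Editorial sketch of how the checkpoint value above is commonly interpreted,
+    # assuming a float in [0, 1] selects a fraction of layers and True/False act
+    # like 1.0/0.0; the exact rule lives in the model code:
+    # ckpt_ratio = float(model["checkpoint"])        # False -> 0.0, True -> 1.0
+    # num_ckpt_layers = int(NUM_LAYER * ckpt_ratio)  # layers recomputed in backward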
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/16384_flash_ckpt_True.py b/configs/7B_train/16384_flash_ckpt_True.py
new file mode 100644
index 00000000..ca2c7f06
--- /dev/null
+++ b/configs/7B_train/16384_flash_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
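+    # Editorial sketch of the dynamic loss-scaling rule that the grad_scaler block
+    # earlier in this file parameterizes (illustrative only, not the actual
+    # implementation; variable names are hypothetical):
+    # if overflow:
+    #     bad_steps += 1
+    #     if bad_steps >= hysteresis:  # tolerate a few overflows before backing off
+    #         scale = max(scale * backoff_factor, min_scale)
+    #         bad_steps, good_steps = 0, 0
+    # else:
+    #     good_steps += 1
+    #     if good_steps % growth_interval == 0:
+    #         scale = min(scale * growth_factor, max_scale)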
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_intern_ckpt_False.py b/configs/7B_train/16384_intern_ckpt_False.py new file mode 100644 index 00000000..93abb682 --- /dev/null +++ b/configs/7B_train/16384_intern_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
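+    # Editorial sketch of the schedule implied by the lr_scheduler block above:
+    # linear warmup over warmup_ratio * total_steps, then cosine decay to eta_min
+    # (assumed shape; the exact curve is defined by InternLM's scheduler class):
+    # import math
+    # warmup_steps = int(lr_scheduler["total_steps"] * lr_scheduler["warmup_ratio"])
+    # if step < warmup_steps:
+    #     lr = adam["lr"] * (step + 1) / warmup_steps
+    # else:
+    #     t = (step - warmup_steps) / max(1, lr_scheduler["total_steps"] - warmup_steps)
+    #     lr = lr_scheduler["eta_min"] + (adam["lr"] - lr_scheduler["eta_min"]) * 0.5 * (1 + math.cos(math.pi * t))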
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_intern_ckpt_True.py b/configs/7B_train/16384_intern_ckpt_True.py new file mode 100644 index 00000000..af9d9945 --- /dev/null +++ b/configs/7B_train/16384_intern_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
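+    # Arithmetic implied by the data block in this file: tokens per rank per
+    # optimizer step = micro_num * micro_bsz * SEQ_LEN = 1 * 1 * 16384, so the
+    # global batch is 16384 * dp_world_size tokens per step.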
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_megatron_ckpt_False.py b/configs/7B_train/16384_megatron_ckpt_False.py new file mode 100644 index 00000000..d2c58d3a --- /dev/null +++ b/configs/7B_train/16384_megatron_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
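+    # Editorial sketch of the feed-forward width implied by mlp_ratio above and
+    # MLP_RATIO = 8 / 3 at the top of this file, assuming LLaMA-style rounding up
+    # to a multiple of 256 (the actual rounding rule is defined by the model code):
+    # multiple_of = 256
+    # ffn = int(HIDDEN_SIZE * MLP_RATIO)  # 4096 * 8 / 3 -> 10922
+    # ffn = multiple_of * ((ffn + multiple_of - 1) // multiple_of)  # -> 11008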
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_megatron_ckpt_True.py b/configs/7B_train/16384_megatron_ckpt_True.py new file mode 100644 index 00000000..6e372b8c --- /dev/null +++ b/configs/7B_train/16384_megatron_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
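+    # The auto_resume comments near the top of this file reduce to the following
+    # decision order (editorial sketch; latest_ckpt()/load() are hypothetical names):
+    # if ckpt["auto_resume"] and latest_ckpt(SAVE_CKPT_FOLDER) is not None:
+    #     load(latest_ckpt(SAVE_CKPT_FOLDER))   # resume after an interruption
+    # elif not ckpt["auto_resume"] and ckpt["load_ckpt_info"] is not None:
+    #     load(ckpt["load_ckpt_info"]["path"])  # warm start from another model
+    # else:
+    #     pass                                  # train from scratch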
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_none_ckpt_False.py b/configs/7B_train/16384_none_ckpt_False.py new file mode 100644 index 00000000..0fd65900 --- /dev/null +++ b/configs/7B_train/16384_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
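+    # Editorial summary of the four sp modes documented in the parallel docstring
+    # below (a hypothetical dispatch; only the mode names and the intern_overlap
+    # flag come from the config docstring itself):
+    # sp == "none":       tensor parallelism only, activations kept whole
+    # sp == "megatron":   Megatron-LM style sequence parallelism around the linears
+    # sp == "flash-attn": flash-attn's sequence-parallel fused dense layers
+    # sp == "intern":     weights all-gathered per layer, with the all_gather and
+    #                     reduce_scatter overlapped with compute when intern_overlap=True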
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_none_ckpt_True.py b/configs/7B_train/16384_none_ckpt_True.py new file mode 100644 index 00000000..6ea5e1a9 --- /dev/null +++ b/configs/7B_train/16384_none_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_flash-attn_ckpt_False.py b/configs/7B_train/262144_flash-attn_ckpt_False.py new file mode 100644 index 00000000..6dad9730 --- /dev/null +++ b/configs/7B_train/262144_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
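+    # Worked example from the constants above (the rounding behaviour is an
+    # assumption, not spelled out in this config):
+    #     per-head dim = HIDDEN_SIZE / NUM_ATTENTION_HEAD = 4096 / 32 = 128
+    #     ffn width   ~= HIDDEN_SIZE * MLP_RATIO = 4096 * 8 / 3 ~= 10922.7,
+    # which the model implementation rounds up to a hardware-friendly multiple.
+    # With tensor size 8 (see `parallel` below), each rank would compute
+    # 32 / 8 = 4 attention heads under the usual Megatron-style head split
+    # (again an assumption about the split rule).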
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', meaning sequence parallel is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/262144_flash-attn_ckpt_True.py b/configs/7B_train/262144_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..cacd9737
--- /dev/null
+++ b/configs/7B_train/262144_flash-attn_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_flash_ckpt_False.py b/configs/7B_train/262144_flash_ckpt_False.py new file mode 100644 index 00000000..0e9b0173 --- /dev/null +++ b/configs/7B_train/262144_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
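+    # Common recipes implied by the comments below (illustrative only, not new settings):
+    #     resume interrupted training:  auto_resume=True  (latest ckpt in save_ckpt_folder wins)
+    #     fine-tune from other weights: auto_resume=False plus load_ckpt_info pointing at them
+    #     train from scratch:           auto_resume=False plus load_ckpt_info=None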
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_flash_ckpt_True.py b/configs/7B_train/262144_flash_ckpt_True.py new file mode 100644 index 00000000..ddacc8df --- /dev/null +++ b/configs/7B_train/262144_flash_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_intern_ckpt_False.py b/configs/7B_train/262144_intern_ckpt_False.py new file mode 100644 index 00000000..e5cf7694 --- /dev/null +++ b/configs/7B_train/262144_intern_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_intern_ckpt_True.py b/configs/7B_train/262144_intern_ckpt_True.py new file mode 100644 index 00000000..76f9386a --- /dev/null +++ b/configs/7B_train/262144_intern_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_megatron_ckpt_False.py b/configs/7B_train/262144_megatron_ckpt_False.py new file mode 100644 index 00000000..b929f9a6 --- /dev/null +++ b/configs/7B_train/262144_megatron_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_megatron_ckpt_True.py b/configs/7B_train/262144_megatron_ckpt_True.py new file mode 100644 index 00000000..1655631c --- /dev/null +++ b/configs/7B_train/262144_megatron_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_none_ckpt_False.py b/configs/7B_train/262144_none_ckpt_False.py new file mode 100644 index 00000000..85512f07 --- /dev/null +++ b/configs/7B_train/262144_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/262144_none_ckpt_True.py b/configs/7B_train/262144_none_ckpt_True.py
new file mode 100644
index 00000000..fef559bd
--- /dev/null
+++ b/configs/7B_train/262144_none_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_flash-attn_ckpt_False.py b/configs/7B_train/32768_flash-attn_ckpt_False.py
new file mode 100644
index 00000000..f2664be8
--- /dev/null
+++ b/configs/7B_train/32768_flash-attn_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_flash-attn_ckpt_True.py b/configs/7B_train/32768_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..232b5904
--- /dev/null
+++ b/configs/7B_train/32768_flash-attn_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
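+    # A worked example for the `checkpoint` field above, assuming the [0-1]
+    # proportion semantics documented in its comment: checkpoint=0.5 would
+    # recompute activations for roughly half of the 32 layers instead of all of them.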
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_flash_ckpt_False.py b/configs/7B_train/32768_flash_ckpt_False.py
new file mode 100644
index 00000000..878b9ac1
--- /dev/null
+++ b/configs/7B_train/32768_flash_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_flash_ckpt_True.py b/configs/7B_train/32768_flash_ckpt_True.py
new file mode 100644
index 00000000..27cffd02
--- /dev/null
+++ b/configs/7B_train/32768_flash_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_intern_ckpt_False.py b/configs/7B_train/32768_intern_ckpt_False.py
new file mode 100644
index 00000000..fcf84197
--- /dev/null
+++ b/configs/7B_train/32768_intern_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
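+    # Worked values for this file's constants: with CHECKPOINT_EVERY = 50,
+    # `checkpoint_every` below saves a full ckpt every 50 steps (once
+    # enable_save_ckpt is turned on), and oss_snapshot_freq = int(50 / 2) = 25
+    # takes an OSS snapshot every 25 steps.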
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_intern_ckpt_True.py b/configs/7B_train/32768_intern_ckpt_True.py
new file mode 100644
index 00000000..aec2b68b
--- /dev/null
+++ b/configs/7B_train/32768_intern_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_megatron_ckpt_False.py b/configs/7B_train/32768_megatron_ckpt_False.py
new file mode 100644
index 00000000..64caeeb5
--- /dev/null
+++ b/configs/7B_train/32768_megatron_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_megatron_ckpt_True.py b/configs/7B_train/32768_megatron_ckpt_True.py
new file mode 100644
index 00000000..a736e7d0
--- /dev/null
+++ b/configs/7B_train/32768_megatron_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/32768_none_ckpt_False.py b/configs/7B_train/32768_none_ckpt_False.py new file mode 100644 index 00000000..3a31776e --- /dev/null +++ b/configs/7B_train/32768_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
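+ # Token-count sanity check for the data dict further down (illustrative): with
+ # micro_bsz = 1 and SEQ_LEN = 32768, packed_length = micro_bsz * SEQ_LEN = 32768,
+ # and micro_num = 1 means each optimizer step consumes 32768 tokens per
+ # data-parallel rank; the global batch then scales with the dp world size.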
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/32768_none_ckpt_True.py b/configs/7B_train/32768_none_ckpt_True.py new file mode 100644 index 00000000..4ac09249 --- /dev/null +++ b/configs/7B_train/32768_none_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
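+ # Note on model.checkpoint further down (a sketch; assuming a fractional value
+ # is applied as int(checkpoint * num_layers)): checkpoint=True recomputes
+ # activations for all 32 layers in backward, checkpoint=0.5 would cover
+ # int(0.5 * 32) == 16 layers, and checkpoint=False keeps everything resident.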
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_flash-attn_ckpt_False.py b/configs/7B_train/4096_flash-attn_ckpt_False.py new file mode 100644 index 00000000..b3de8990 --- /dev/null +++ b/configs/7B_train/4096_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
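+ # Summary of the grad_scaler dict further down, restating its own comments: the
+ # fp16 loss scale starts at 2**16, doubles (growth_factor=2) after every 1000
+ # overflow-free steps up to max_scale=2**24, and is halved (backoff_factor=0.5)
+ # once hysteresis=2 overflows accumulate, never dropping below min_scale=1.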
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_flash-attn_ckpt_True.py b/configs/7B_train/4096_flash-attn_ckpt_True.py new file mode 100644 index 00000000..b44b103f --- /dev/null +++ b/configs/7B_train/4096_flash-attn_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
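+ # Sharding example for the parallel dict further down (hypothetical 64-GPU job):
+ # tensor.size=8 leaves a dp group of 64 / 8 = 8 ranks, and zero1.size=-1 makes
+ # the ZeRO-1 group span all 8, so each rank keeps 1/8 of the optimizer states
+ # while the model parameters themselves stay fully replicated across dp.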
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_flash_ckpt_False.py b/configs/7B_train/4096_flash_ckpt_False.py new file mode 100644 index 00000000..8ac542d6 --- /dev/null +++ b/configs/7B_train/4096_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
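+ # Warmup arithmetic for the lr_scheduler further down (illustrative, assuming
+ # warmup steps are derived as int(warmup_ratio * total_steps)): 0.01 * 20 rounds
+ # down to 0, so this short 20-step profile effectively starts directly on the
+ # cosine decay from lr=1e-4 toward eta_min=1e-5.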
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_flash_ckpt_True.py b/configs/7B_train/4096_flash_ckpt_True.py new file mode 100644 index 00000000..ec477f68 --- /dev/null +++ b/configs/7B_train/4096_flash_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
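+ # Note on the beta2_scheduler further down: it is seeded with init_beta2=0.95
+ # and c=adam_beta2_c=0; with c left at 0 the schedule is presumably inert, so
+ # Adam's beta2 stays constant at 0.95 for the whole run.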
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_intern_ckpt_False.py b/configs/7B_train/4096_intern_ckpt_False.py new file mode 100644 index 00000000..f16f95ad --- /dev/null +++ b/configs/7B_train/4096_intern_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
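+ # Sequence-split arithmetic for the parallel dict further down (applies when an
+ # sp mode other than 'none' actually splits along the sequence dim): with
+ # tensor.size=8 and SEQ_LEN=4096, each rank holds 4096 / 8 = 512 tokens in the
+ # sequence-parallel regions, roughly an 8x cut in those activations.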
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_intern_ckpt_True.py b/configs/7B_train/4096_intern_ckpt_True.py new file mode 100644 index 00000000..90fed7c8 --- /dev/null +++ b/configs/7B_train/4096_intern_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
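+ # Note on intern_overlap=True in the parallel dict further down: per the
+ # docstring, the 'intern' sp mode can overlap its all_gather/reduce_scatter
+ # traffic with computation, presumably hiding the communication behind the
+ # linear-layer matmuls instead of serializing it.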
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_megatron_ckpt_False.py b/configs/7B_train/4096_megatron_ckpt_False.py new file mode 100644 index 00000000..ca41fa28 --- /dev/null +++ b/configs/7B_train/4096_megatron_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_megatron_ckpt_True.py b/configs/7B_train/4096_megatron_ckpt_True.py new file mode 100644 index 00000000..45183156 --- /dev/null +++ b/configs/7B_train/4096_megatron_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_none_ckpt_False.py b/configs/7B_train/4096_none_ckpt_False.py new file mode 100644 index 00000000..c81bb5b9 --- /dev/null +++ b/configs/7B_train/4096_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_none_ckpt_True.py b/configs/7B_train/4096_none_ckpt_True.py new file mode 100644 index 00000000..a25d222f --- /dev/null +++ b/configs/7B_train/4096_none_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_flash-attn_ckpt_False.py b/configs/7B_train/65536_flash-attn_ckpt_False.py new file mode 100644 index 00000000..3d5a81eb --- /dev/null +++ b/configs/7B_train/65536_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_flash-attn_ckpt_True.py b/configs/7B_train/65536_flash-attn_ckpt_True.py new file mode 100644 index 00000000..c6982c98 --- /dev/null +++ b/configs/7B_train/65536_flash-attn_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_flash_ckpt_False.py b/configs/7B_train/65536_flash_ckpt_False.py new file mode 100644 index 00000000..0cfea813 --- /dev/null +++ b/configs/7B_train/65536_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_flash_ckpt_True.py b/configs/7B_train/65536_flash_ckpt_True.py new file mode 100644 index 00000000..abdeb49d --- /dev/null +++ b/configs/7B_train/65536_flash_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_intern_ckpt_False.py b/configs/7B_train/65536_intern_ckpt_False.py new file mode 100644 index 00000000..2e0b27e1 --- /dev/null +++ b/configs/7B_train/65536_intern_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+ # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+ # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+ # with an automatic restart mechanism upon training reboot.
+ # Please note that `auto_resume` defaults to True; while it is True, the checkpoint path
+ # specified in `load_ckpt_info` will not be loaded.
+ # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+ # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+ auto_resume=True,
+ checkpoint_every=CHECKPOINT_EVERY,
+ async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+ async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+ oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+ seq_len=SEQ_LEN,
+ # micro_num means the number of micro_batches contained in one gradient update
+ micro_num=1,
+ # packed_length = micro_bsz * SEQ_LEN
+ micro_bsz=1,
+ # defaults to the value of micro_num
+ valid_micro_num=4,
+ # defaults to 0, which disables evaluation
+ valid_every=50,
+ pack_sample_into_one=True,
+ total_steps=20,
+ skip_batches="",
+ rampup_batch_size="",
+ # Datasets with fewer than 50 rows will be discarded
+ min_length=50,
+ # train_folder=TRAIN_FOLDER,
+ # valid_folder=VALID_FOLDER,
+ empty_cache_and_diag_interval=100,
+ diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+ fp16=dict(
+ # the initial loss scale, defaults to 2**16
+ initial_scale=2**16,
+ # the minimum loss scale, defaults to None
+ min_scale=1,
+ # the number of steps to increase loss scale when no overflow occurs
+ growth_interval=1000,
+ ),
+ # the multiplication factor for increasing loss scale, defaults to 2
+ growth_factor=2,
+ # the multiplication factor for decreasing loss scale, defaults to 0.5
+ backoff_factor=0.5,
+ # the maximum loss scale, defaults to None
+ max_scale=2**24,
+ # the number of overflows before decreasing loss scale, defaults to 2
+ hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+ # Enable low_level_optimizer overlap_communication
+ overlap_sync_grad=True,
+ overlap_sync_param=False,
+ # bucket size for nccl communication params
+ reduce_bucket_size=512 * 1024 * 1024,
+ # grad clipping
+ clip_grad_norm=1.0,
+)
+
+loss = dict(
+ label_smoothing=0,
+)
+
+adam = dict(
+ lr=1e-4,
+ adam_beta1=0.9,
+ adam_beta2=0.95,
+ adam_beta2_c=0,
+ adam_eps=1e-8,
+ weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+ total_steps=data["total_steps"],
+ init_steps=0, # optimizer_warmup_step
+ warmup_ratio=0.01,
+ eta_min=1e-5,
+ last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+ init_beta2=adam["adam_beta2"],
+ c=adam["adam_beta2_c"],
+ cur_iter=-1,
+)
+
+model = dict(
+ checkpoint=False, # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
+ num_attention_heads=NUM_ATTENTION_HEAD,
+ embed_split_hidden=True,
+ vocab_size=VOCAB_SIZE,
+ embed_grad_scale=1,
+ parallel_output=True,
+ hidden_size=HIDDEN_SIZE,
+ num_layers=NUM_LAYER,
+ mlp_ratio=MLP_RATIO,
+ apply_post_layer_norm=False,
+ dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+ norm_type="rmsnorm",
+ layer_norm_epsilon=1e-5,
+ use_flash_attn=True,
+ num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+ 1. size: int
+ * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+ so parameters will be divided within the range of dp.
+ * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+ * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+ For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+ 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+ 1. size: int, the tensor parallel size.
+ 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+ defaults to 'none', which disables sequence parallelism.
+ 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+ defaults to False.
+pipeline parallel (dict):
+ 1. size: int, the pipeline parallel size.
+ 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+ defaults to False.
+"""
+parallel = dict(
+ zero1=dict(size=-1, fsdp=False),
+ tensor=dict(size=8, sp="intern", intern_overlap=True),
+ pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+ # feishu alert configs
+ alert=dict(
+ enable_feishu_alert=DO_ALERT,
+ feishu_alert_address=None, # feishu webhook to send alert message
+ light_monitor_address=None, # light_monitor address to send heartbeat
+ ),
+)
diff --git a/configs/7B_train/65536_intern_ckpt_True.py b/configs/7B_train/65536_intern_ckpt_True.py
new file mode 100644
index 00000000..d1a8de7c
--- /dev/null
+++ b/configs/7B_train/65536_intern_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+ enable_save_ckpt=False, # enable ckpt save.
+ save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+ # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+ load_ckpt_folder="local:llm_ckpts/",
+ # 'load_ckpt_info' setting guide:
+ # 1. the 'path' indicates the ckpt path,
+ # 2. the 'content' means what states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+ # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
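+ # Per the `auto_resume` notes below: to train from scratch, set auto_resume=False and
+ # load_ckpt_info=None, so neither a saved checkpoint nor external weights are loaded.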
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+ # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+ # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+ # with an automatic restart mechanism upon training reboot.
+ # Please note that `auto_resume` defaults to True; while it is True, the checkpoint path
+ # specified in `load_ckpt_info` will not be loaded.
+ # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+ # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+ auto_resume=True,
+ checkpoint_every=CHECKPOINT_EVERY,
+ async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+ async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+ oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+ seq_len=SEQ_LEN,
+ # micro_num means the number of micro_batches contained in one gradient update
+ micro_num=1,
+ # packed_length = micro_bsz * SEQ_LEN
+ micro_bsz=1,
+ # defaults to the value of micro_num
+ valid_micro_num=4,
+ # defaults to 0, which disables evaluation
+ valid_every=50,
+ pack_sample_into_one=True,
+ total_steps=20,
+ skip_batches="",
+ rampup_batch_size="",
+ # Datasets with fewer than 50 rows will be discarded
+ min_length=50,
+ # train_folder=TRAIN_FOLDER,
+ # valid_folder=VALID_FOLDER,
+ empty_cache_and_diag_interval=100,
+ diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+ fp16=dict(
+ # the initial loss scale, defaults to 2**16
+ initial_scale=2**16,
+ # the minimum loss scale, defaults to None
+ min_scale=1,
+ # the number of steps to increase loss scale when no overflow occurs
+ growth_interval=1000,
+ ),
+ # the multiplication factor for increasing loss scale, defaults to 2
+ growth_factor=2,
+ # the multiplication factor for decreasing loss scale, defaults to 0.5
+ backoff_factor=0.5,
+ # the maximum loss scale, defaults to None
+ max_scale=2**24,
+ # the number of overflows before decreasing loss scale, defaults to 2
+ hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+ # Enable low_level_optimizer overlap_communication
+ overlap_sync_grad=True,
+ overlap_sync_param=False,
+ # bucket size for nccl communication params
+ reduce_bucket_size=512 * 1024 * 1024,
+ # grad clipping
+ clip_grad_norm=1.0,
+)
+
+loss = dict(
+ label_smoothing=0,
+)
+
+adam = dict(
+ lr=1e-4,
+ adam_beta1=0.9,
+ adam_beta2=0.95,
+ adam_beta2_c=0,
+ adam_eps=1e-8,
+ weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+ total_steps=data["total_steps"],
+ init_steps=0, # optimizer_warmup_step
+ warmup_ratio=0.01,
+ eta_min=1e-5,
+ last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+ init_beta2=adam["adam_beta2"],
+ c=adam["adam_beta2_c"],
+ cur_iter=-1,
+)
+
+model = dict(
+ checkpoint=True, # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
+ num_attention_heads=NUM_ATTENTION_HEAD,
+ embed_split_hidden=True,
+ vocab_size=VOCAB_SIZE,
+ embed_grad_scale=1,
+ parallel_output=True,
+ hidden_size=HIDDEN_SIZE,
+ num_layers=NUM_LAYER,
+ mlp_ratio=MLP_RATIO,
+ apply_post_layer_norm=False,
+ dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+ norm_type="rmsnorm",
+ layer_norm_epsilon=1e-5,
+ use_flash_attn=True,
+ num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+ 1. size: int
+ * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+ so parameters will be divided within the range of dp.
+ * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+ * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+ For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+ 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+ 1. size: int, the tensor parallel size.
+ 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+ defaults to 'none', which disables sequence parallelism.
+ 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+ defaults to False.
+pipeline parallel (dict):
+ 1. size: int, the pipeline parallel size.
+ 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+ defaults to False.
+"""
+parallel = dict(
+ zero1=dict(size=-1, fsdp=False),
+ tensor=dict(size=8, sp="intern", intern_overlap=True),
+ pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+ # feishu alert configs
+ alert=dict(
+ enable_feishu_alert=DO_ALERT,
+ feishu_alert_address=None, # feishu webhook to send alert message
+ light_monitor_address=None, # light_monitor address to send heartbeat
+ ),
+)
diff --git a/configs/7B_train/65536_megatron_ckpt_False.py b/configs/7B_train/65536_megatron_ckpt_False.py
new file mode 100644
index 00000000..7de7b92d
--- /dev/null
+++ b/configs/7B_train/65536_megatron_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+ enable_save_ckpt=False, # enable ckpt save.
+ save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+ # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+ load_ckpt_folder="local:llm_ckpts/",
+ # 'load_ckpt_info' setting guide:
+ # 1. the 'path' indicates the ckpt path,
+ # 2. the 'content' means what states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+ # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
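+ # (Reference for the grad_scaler below: the loss scale starts at 2**16, is multiplied by
+ # growth_factor=2 after every 1000 overflow-free steps up to max_scale=2**24, and is
+ # multiplied by backoff_factor=0.5 once hysteresis=2 overflows have occurred.)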
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_megatron_ckpt_True.py b/configs/7B_train/65536_megatron_ckpt_True.py new file mode 100644 index 00000000..b339c833 --- /dev/null +++ b/configs/7B_train/65536_megatron_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
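+ # (Reference for the data dict below: packed_length = micro_bsz * SEQ_LEN, so with
+ # micro_bsz=1 and SEQ_LEN=65536 each micro batch packs 1 * 65536 = 65536 tokens.)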
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_none_ckpt_False.py b/configs/7B_train/65536_none_ckpt_False.py new file mode 100644 index 00000000..b8c44769 --- /dev/null +++ b/configs/7B_train/65536_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_none_ckpt_True.py b/configs/7B_train/65536_none_ckpt_True.py new file mode 100644 index 00000000..b907e437 --- /dev/null +++ b/configs/7B_train/65536_none_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/8192_flash-attn_ckpt_False.py b/configs/7B_train/8192_flash-attn_ckpt_False.py new file mode 100644 index 00000000..d0ddd438 --- /dev/null +++ b/configs/7B_train/8192_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
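+ # (Reference: with CHECKPOINT_EVERY = 50 above, oss_snapshot_freq below evaluates to
+ # int(50 / 2) = 25, i.e. a snapshot checkpoint every 25 steps.)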
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/8192_flash-attn_ckpt_True.py b/configs/7B_train/8192_flash-attn_ckpt_True.py new file mode 100644 index 00000000..d9e5b2f9 --- /dev/null +++ b/configs/7B_train/8192_flash-attn_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
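+ # (Sizing note, assuming attention heads are split evenly across tensor-parallel ranks:
+ # with NUM_ATTENTION_HEAD = 32 above and tensor size 8 in the `parallel` dict further
+ # below, each rank computes 32 / 8 = 4 heads.)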
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/8192_flash_ckpt_False.py b/configs/7B_train/8192_flash_ckpt_False.py new file mode 100644 index 00000000..69546d11 --- /dev/null +++ b/configs/7B_train/8192_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_flash_ckpt_True.py b/configs/7B_train/8192_flash_ckpt_True.py
new file mode 100644
index 00000000..4c7f9864
--- /dev/null
+++ b/configs/7B_train/8192_flash_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_intern_ckpt_False.py b/configs/7B_train/8192_intern_ckpt_False.py
new file mode 100644
index 00000000..9694ad81
--- /dev/null
+++ b/configs/7B_train/8192_intern_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_intern_ckpt_True.py b/configs/7B_train/8192_intern_ckpt_True.py
new file mode 100644
index 00000000..99a0fc18
--- /dev/null
+++ b/configs/7B_train/8192_intern_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
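+    # Illustrative example (hypothetical values, not part of the original patch): 'content' accepts any subset of
+    # the states listed above, e.g. content=("model", "optimizer", "scheduler") restores optimizer and scheduler
+    # state alongside the model weights.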
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_megatron_ckpt_False.py b/configs/7B_train/8192_megatron_ckpt_False.py
new file mode 100644
index 00000000..f18ee730
--- /dev/null
+++ b/configs/7B_train/8192_megatron_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
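+    # Note (hedged reading of this config): the commented-out dict form of 'load_ckpt_folder' above appears to be
+    # the older interface; 'load_ckpt_info' below is the field that the setting guide describes.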
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_megatron_ckpt_True.py b/configs/7B_train/8192_megatron_ckpt_True.py
new file mode 100644
index 00000000..1db58412
--- /dev/null
+++ b/configs/7B_train/8192_megatron_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_none_ckpt_False.py b/configs/7B_train/8192_none_ckpt_False.py
new file mode 100644
index 00000000..95d686bb
--- /dev/null
+++ b/configs/7B_train/8192_none_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_none_ckpt_True.py b/configs/7B_train/8192_none_ckpt_True.py
new file mode 100644
index 00000000..a63b6f20
--- /dev/null
+++ b/configs/7B_train/8192_none_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
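+    # Worked example (illustrative, not part of the original patch): with CHECKPOINT_EVERY = 50, the snapshot
+    # frequency below, int(CHECKPOINT_EVERY / 2), evaluates to 25, i.e. two snapshots per checkpoint interval.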
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/generate.py b/configs/generate.py
new file mode 100644
index 00000000..6a58f098
--- /dev/null
+++ b/configs/generate.py
@@ -0,0 +1,47 @@
+import os
+import copy
+import subprocess
+
+name = "./configs/"
+root_names = ["7B_train_", "13B_train_", "30B_train_"]
+model_size = ["7B", "13B", "30B"]
+seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
+sp = ["none", "megatron", "flash-attn", "intern"]
+intern_overlap = [False, False, False, True]
+checkpoint = [False, True]
+
+for idx, root_name in enumerate(root_names):
+
+    # Path of the per-model config folder to create
+    folder_path = name + root_name[:-1]
+
+    # Create the folder with os.mkdir() if it does not already exist
+    if not os.path.exists(folder_path):
+        os.mkdir(folder_path)
+
+    file_name = name + f"{model_size[idx]}_template.py"
+
+    with open(file_name, "r") as f:
+        lines = f.readlines()
+        origin_line = "".join(lines)
+        # Instantiate the template for every (seq_len, sp mode, checkpoint) combination
+        for seq in seq_length:
+            for i, sp_mode in enumerate(sp):
+                for ckpt in checkpoint:
+                    line = copy.copy(origin_line)
+                    line = line.replace("{seq_len}", str(seq))
+                    line = line.replace("{sp}", f"\"{sp_mode}\"")
+                    line = line.replace("{intern_overlap}", str(intern_overlap[i]))
+                    line = line.replace("{checkpoint}", str(ckpt))
+                    output_file_name = str(seq) + "_" + str(sp_mode) + "_ckpt_" + str(ckpt) + ".py"
+                    write_file = folder_path + "/" + output_file_name
+                    with open(write_file, "w") as file:
+                        file.write(line)
+
+                    # root_name already ends with "_", so no extra separator is needed
+                    log_name = root_name + output_file_name[:-3]
+
+                    # Launch one training run per generated config and tee the log
+                    command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=10 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
+                    process = subprocess.Popen(command, shell=True, executable='/bin/bash')
+                    process.wait()
\ No newline at end of file
diff --git a/internlm/train/training_internlm.py
b/internlm/train/training_internlm.py index 5e874d39..0b605e53 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -396,6 +396,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): tgs_list = [] +tflops_list = [] @llm_timeout(func_name="record_current_batch_training_metrics") @@ -573,6 +574,7 @@ def record_current_batch_training_metrics( if batch_count >= 5: tgs_list.append(tgs_origin) + tflops_list.append(tflops) if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) @@ -580,3 +582,9 @@ def record_current_batch_training_metrics( if abs(tgs - avg_tgs) > 400: tgs_list.remove(tgs) print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) + print(tflops_list, flush=True) + avg_tflops = sum(tflops_list) / len(tflops_list) + for tf in tflops_list.copy(): + if abs(tf - avg_tflops) > 10: + tflops_list.remove(tf) + print(f"avg_tflops: {sum(tflops_list)/len(tflops_list)}", flush=True) From 41cfa1a10a673e74c64653afda8395309c0f7d75 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 24 Oct 2023 18:47:27 +0800 Subject: [PATCH 053/153] feat(model/overlap_handler.py): fix overlap handler None bug --- internlm/model/overlap_handler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 5cef92f9..35d8a594 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -312,7 +312,8 @@ def __init__(self, overlap_handler: FSTPOverlapHandler) -> None: self._overlap_handler = overlap_handler def before_forward(self, scheduler, inputs) -> None: - self._overlap_handler.set_forward_mode(True) + if self._overlap_handler is not None: + self._overlap_handler.set_forward_mode(True) def after_forward(self, scheduler, outputs) -> None: pass @@ -324,7 +325,8 @@ def after_criterion(self, scheduler, loss) -> None: pass def before_backward(self, scheduler, outputs, outputs_grad) -> None: - self._overlap_handler.set_forward_mode(False) + if self._overlap_handler is not None: + self._overlap_handler.set_forward_mode(False) def after_backward(self, scheduler, inputs_grad) -> None: pass From 0bac166b7a82a556e8f2ba301a4e4f6d353c8b1f Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 25 Oct 2023 13:44:15 +0800 Subject: [PATCH 054/153] add test --- .gitignore | 4 + configs/13B_template.py | 8 +- .../13B_train/131072_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/131072_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/131072_flash_ckpt_False.py | 180 ----------------- configs/13B_train/131072_flash_ckpt_True.py | 180 ----------------- configs/13B_train/131072_intern_ckpt_False.py | 180 ----------------- configs/13B_train/131072_intern_ckpt_True.py | 180 ----------------- .../13B_train/131072_megatron_ckpt_False.py | 180 ----------------- .../13B_train/131072_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/131072_none_ckpt_False.py | 180 ----------------- configs/13B_train/131072_none_ckpt_True.py | 180 ----------------- .../13B_train/16384_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/16384_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/16384_flash_ckpt_False.py | 180 ----------------- configs/13B_train/16384_flash_ckpt_True.py | 180 ----------------- configs/13B_train/16384_intern_ckpt_False.py | 180 ----------------- 
configs/13B_train/16384_intern_ckpt_True.py | 180 ----------------- .../13B_train/16384_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/16384_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/16384_none_ckpt_False.py | 180 ----------------- configs/13B_train/16384_none_ckpt_True.py | 180 ----------------- .../13B_train/262144_flash-attn_ckpt_False.py | 180 ----------------- configs/13B_train/262144_flash_ckpt_False.py | 180 ----------------- configs/13B_train/262144_flash_ckpt_True.py | 180 ----------------- configs/13B_train/262144_intern_ckpt_False.py | 180 ----------------- configs/13B_train/262144_intern_ckpt_True.py | 180 ----------------- .../13B_train/262144_megatron_ckpt_False.py | 180 ----------------- .../13B_train/262144_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/262144_none_ckpt_False.py | 180 ----------------- configs/13B_train/262144_none_ckpt_True.py | 180 ----------------- .../13B_train/32768_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/32768_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/32768_flash_ckpt_False.py | 180 ----------------- configs/13B_train/32768_flash_ckpt_True.py | 180 ----------------- configs/13B_train/32768_intern_ckpt_False.py | 180 ----------------- configs/13B_train/32768_intern_ckpt_True.py | 180 ----------------- .../13B_train/32768_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/32768_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/32768_none_ckpt_False.py | 180 ----------------- configs/13B_train/32768_none_ckpt_True.py | 180 ----------------- .../13B_train/4096_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/4096_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/4096_flash_ckpt_False.py | 180 ----------------- configs/13B_train/4096_flash_ckpt_True.py | 180 ----------------- configs/13B_train/4096_intern_ckpt_False.py | 180 ----------------- configs/13B_train/4096_intern_ckpt_True.py | 180 ----------------- configs/13B_train/4096_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/4096_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/4096_none_ckpt_False.py | 180 ----------------- configs/13B_train/4096_none_ckpt_True.py | 180 ----------------- .../13B_train/65536_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/65536_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/65536_flash_ckpt_False.py | 180 ----------------- configs/13B_train/65536_flash_ckpt_True.py | 180 ----------------- configs/13B_train/65536_intern_ckpt_False.py | 180 ----------------- configs/13B_train/65536_intern_ckpt_True.py | 180 ----------------- .../13B_train/65536_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/65536_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/65536_none_ckpt_False.py | 180 ----------------- configs/13B_train/65536_none_ckpt_True.py | 180 ----------------- .../13B_train/8192_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/8192_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/8192_flash_ckpt_False.py | 180 ----------------- configs/13B_train/8192_flash_ckpt_True.py | 180 ----------------- configs/13B_train/8192_intern_ckpt_False.py | 180 ----------------- configs/13B_train/8192_intern_ckpt_True.py | 180 ----------------- configs/13B_train/8192_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/8192_megatron_ckpt_True.py | 180 ----------------- 
configs/13B_train/8192_none_ckpt_False.py | 180 ----------------- configs/13B_train/8192_none_ckpt_True.py | 180 ----------------- configs/30B_template.py | 8 +- configs/30B_train/131072_flash_ckpt_False.py | 180 ----------------- configs/30B_train/131072_flash_ckpt_True.py | 180 ----------------- configs/30B_train/131072_intern_ckpt_False.py | 180 ----------------- configs/30B_train/131072_intern_ckpt_True.py | 180 ----------------- .../30B_train/131072_megatron_ckpt_False.py | 180 ----------------- .../30B_train/131072_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/131072_none_ckpt_False.py | 180 ----------------- configs/30B_train/131072_none_ckpt_True.py | 180 ----------------- configs/30B_train/16384_flash_ckpt_False.py | 180 ----------------- configs/30B_train/16384_flash_ckpt_True.py | 180 ----------------- configs/30B_train/16384_intern_ckpt_False.py | 180 ----------------- configs/30B_train/16384_intern_ckpt_True.py | 180 ----------------- .../30B_train/16384_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/16384_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/16384_none_ckpt_False.py | 180 ----------------- configs/30B_train/16384_none_ckpt_True.py | 180 ----------------- configs/30B_train/262144_flash_ckpt_False.py | 180 ----------------- configs/30B_train/262144_flash_ckpt_True.py | 180 ----------------- configs/30B_train/262144_intern_ckpt_False.py | 180 ----------------- configs/30B_train/262144_intern_ckpt_True.py | 180 ----------------- .../30B_train/262144_megatron_ckpt_False.py | 180 ----------------- .../30B_train/262144_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/262144_none_ckpt_False.py | 180 ----------------- configs/30B_train/262144_none_ckpt_True.py | 180 ----------------- configs/30B_train/32768_flash_ckpt_False.py | 180 ----------------- configs/30B_train/32768_flash_ckpt_True.py | 180 ----------------- configs/30B_train/32768_intern_ckpt_False.py | 180 ----------------- configs/30B_train/32768_intern_ckpt_True.py | 180 ----------------- .../30B_train/32768_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/32768_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/32768_none_ckpt_False.py | 180 ----------------- configs/30B_train/32768_none_ckpt_True.py | 180 ----------------- configs/30B_train/4096_flash_ckpt_False.py | 180 ----------------- configs/30B_train/4096_flash_ckpt_True.py | 180 ----------------- configs/30B_train/4096_intern_ckpt_False.py | 180 ----------------- configs/30B_train/4096_intern_ckpt_True.py | 180 ----------------- configs/30B_train/4096_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/4096_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/4096_none_ckpt_False.py | 180 ----------------- configs/30B_train/4096_none_ckpt_True.py | 180 ----------------- configs/30B_train/65536_flash_ckpt_False.py | 180 ----------------- configs/30B_train/65536_flash_ckpt_True.py | 180 ----------------- configs/30B_train/65536_intern_ckpt_False.py | 180 ----------------- configs/30B_train/65536_intern_ckpt_True.py | 180 ----------------- .../30B_train/65536_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/65536_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/65536_none_ckpt_False.py | 180 ----------------- configs/30B_train/65536_none_ckpt_True.py | 180 ----------------- configs/30B_train/8192_flash_ckpt_False.py | 180 ----------------- configs/30B_train/8192_flash_ckpt_True.py | 180 ----------------- 
configs/30B_train/8192_intern_ckpt_False.py | 180 ----------------- configs/30B_train/8192_intern_ckpt_True.py | 180 ----------------- configs/30B_train/8192_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/8192_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/8192_none_ckpt_False.py | 180 ----------------- configs/30B_train/8192_none_ckpt_True.py | 180 ----------------- configs/7B_template.py | 2 +- .../7B_train/131072_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/131072_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/131072_flash_ckpt_False.py | 181 ------------------ configs/7B_train/131072_flash_ckpt_True.py | 181 ------------------ configs/7B_train/131072_intern_ckpt_False.py | 181 ------------------ configs/7B_train/131072_intern_ckpt_True.py | 181 ------------------ .../7B_train/131072_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/131072_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/131072_none_ckpt_False.py | 181 ------------------ configs/7B_train/131072_none_ckpt_True.py | 181 ------------------ .../7B_train/16384_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/16384_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/16384_flash_ckpt_False.py | 181 ------------------ configs/7B_train/16384_flash_ckpt_True.py | 181 ------------------ configs/7B_train/16384_intern_ckpt_False.py | 181 ------------------ configs/7B_train/16384_intern_ckpt_True.py | 181 ------------------ configs/7B_train/16384_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/16384_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/16384_none_ckpt_False.py | 181 ------------------ configs/7B_train/16384_none_ckpt_True.py | 181 ------------------ .../7B_train/262144_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/262144_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/262144_flash_ckpt_False.py | 181 ------------------ configs/7B_train/262144_flash_ckpt_True.py | 181 ------------------ configs/7B_train/262144_intern_ckpt_False.py | 181 ------------------ configs/7B_train/262144_intern_ckpt_True.py | 181 ------------------ .../7B_train/262144_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/262144_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/262144_none_ckpt_False.py | 181 ------------------ configs/7B_train/262144_none_ckpt_True.py | 181 ------------------ .../7B_train/32768_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/32768_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/32768_flash_ckpt_False.py | 181 ------------------ configs/7B_train/32768_flash_ckpt_True.py | 181 ------------------ configs/7B_train/32768_intern_ckpt_False.py | 181 ------------------ configs/7B_train/32768_intern_ckpt_True.py | 181 ------------------ configs/7B_train/32768_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/32768_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/32768_none_ckpt_False.py | 181 ------------------ configs/7B_train/32768_none_ckpt_True.py | 181 ------------------ .../7B_train/4096_flash-attn_ckpt_False.py | 181 ------------------ configs/7B_train/4096_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/4096_flash_ckpt_False.py | 181 ------------------ configs/7B_train/4096_flash_ckpt_True.py | 181 ------------------ configs/7B_train/4096_intern_ckpt_False.py | 181 ------------------ 
configs/7B_train/4096_intern_ckpt_True.py | 181 ------------------ configs/7B_train/4096_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/4096_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/4096_none_ckpt_False.py | 181 ------------------ configs/7B_train/4096_none_ckpt_True.py | 181 ------------------ .../7B_train/65536_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/65536_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/65536_flash_ckpt_False.py | 181 ------------------ configs/7B_train/65536_flash_ckpt_True.py | 181 ------------------ configs/7B_train/65536_intern_ckpt_False.py | 181 ------------------ configs/7B_train/65536_intern_ckpt_True.py | 181 ------------------ configs/7B_train/65536_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/65536_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/65536_none_ckpt_False.py | 181 ------------------ configs/7B_train/65536_none_ckpt_True.py | 181 ------------------ .../7B_train/8192_flash-attn_ckpt_False.py | 181 ------------------ configs/7B_train/8192_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/8192_flash_ckpt_False.py | 181 ------------------ configs/7B_train/8192_flash_ckpt_True.py | 181 ------------------ configs/7B_train/8192_intern_ckpt_False.py | 181 ------------------ configs/7B_train/8192_intern_ckpt_True.py | 181 ------------------ configs/7B_train/8192_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/8192_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/8192_none_ckpt_False.py | 181 ------------------ configs/7B_train/8192_none_ckpt_True.py | 181 ------------------ configs/generate.py | 24 ++- 200 files changed, 33 insertions(+), 35183 deletions(-) delete mode 100644 configs/13B_train/131072_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/131072_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/131072_flash_ckpt_False.py delete mode 100644 configs/13B_train/131072_flash_ckpt_True.py delete mode 100644 configs/13B_train/131072_intern_ckpt_False.py delete mode 100644 configs/13B_train/131072_intern_ckpt_True.py delete mode 100644 configs/13B_train/131072_megatron_ckpt_False.py delete mode 100644 configs/13B_train/131072_megatron_ckpt_True.py delete mode 100644 configs/13B_train/131072_none_ckpt_False.py delete mode 100644 configs/13B_train/131072_none_ckpt_True.py delete mode 100644 configs/13B_train/16384_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/16384_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/16384_flash_ckpt_False.py delete mode 100644 configs/13B_train/16384_flash_ckpt_True.py delete mode 100644 configs/13B_train/16384_intern_ckpt_False.py delete mode 100644 configs/13B_train/16384_intern_ckpt_True.py delete mode 100644 configs/13B_train/16384_megatron_ckpt_False.py delete mode 100644 configs/13B_train/16384_megatron_ckpt_True.py delete mode 100644 configs/13B_train/16384_none_ckpt_False.py delete mode 100644 configs/13B_train/16384_none_ckpt_True.py delete mode 100644 configs/13B_train/262144_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/262144_flash_ckpt_False.py delete mode 100644 configs/13B_train/262144_flash_ckpt_True.py delete mode 100644 configs/13B_train/262144_intern_ckpt_False.py delete mode 100644 configs/13B_train/262144_intern_ckpt_True.py delete mode 100644 configs/13B_train/262144_megatron_ckpt_False.py delete mode 100644 configs/13B_train/262144_megatron_ckpt_True.py delete mode 100644 
configs/13B_train/262144_none_ckpt_False.py delete mode 100644 configs/13B_train/262144_none_ckpt_True.py delete mode 100644 configs/13B_train/32768_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/32768_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/32768_flash_ckpt_False.py delete mode 100644 configs/13B_train/32768_flash_ckpt_True.py delete mode 100644 configs/13B_train/32768_intern_ckpt_False.py delete mode 100644 configs/13B_train/32768_intern_ckpt_True.py delete mode 100644 configs/13B_train/32768_megatron_ckpt_False.py delete mode 100644 configs/13B_train/32768_megatron_ckpt_True.py delete mode 100644 configs/13B_train/32768_none_ckpt_False.py delete mode 100644 configs/13B_train/32768_none_ckpt_True.py delete mode 100644 configs/13B_train/4096_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/4096_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/4096_flash_ckpt_False.py delete mode 100644 configs/13B_train/4096_flash_ckpt_True.py delete mode 100644 configs/13B_train/4096_intern_ckpt_False.py delete mode 100644 configs/13B_train/4096_intern_ckpt_True.py delete mode 100644 configs/13B_train/4096_megatron_ckpt_False.py delete mode 100644 configs/13B_train/4096_megatron_ckpt_True.py delete mode 100644 configs/13B_train/4096_none_ckpt_False.py delete mode 100644 configs/13B_train/4096_none_ckpt_True.py delete mode 100644 configs/13B_train/65536_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/65536_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/65536_flash_ckpt_False.py delete mode 100644 configs/13B_train/65536_flash_ckpt_True.py delete mode 100644 configs/13B_train/65536_intern_ckpt_False.py delete mode 100644 configs/13B_train/65536_intern_ckpt_True.py delete mode 100644 configs/13B_train/65536_megatron_ckpt_False.py delete mode 100644 configs/13B_train/65536_megatron_ckpt_True.py delete mode 100644 configs/13B_train/65536_none_ckpt_False.py delete mode 100644 configs/13B_train/65536_none_ckpt_True.py delete mode 100644 configs/13B_train/8192_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/8192_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/8192_flash_ckpt_False.py delete mode 100644 configs/13B_train/8192_flash_ckpt_True.py delete mode 100644 configs/13B_train/8192_intern_ckpt_False.py delete mode 100644 configs/13B_train/8192_intern_ckpt_True.py delete mode 100644 configs/13B_train/8192_megatron_ckpt_False.py delete mode 100644 configs/13B_train/8192_megatron_ckpt_True.py delete mode 100644 configs/13B_train/8192_none_ckpt_False.py delete mode 100644 configs/13B_train/8192_none_ckpt_True.py delete mode 100644 configs/30B_train/131072_flash_ckpt_False.py delete mode 100644 configs/30B_train/131072_flash_ckpt_True.py delete mode 100644 configs/30B_train/131072_intern_ckpt_False.py delete mode 100644 configs/30B_train/131072_intern_ckpt_True.py delete mode 100644 configs/30B_train/131072_megatron_ckpt_False.py delete mode 100644 configs/30B_train/131072_megatron_ckpt_True.py delete mode 100644 configs/30B_train/131072_none_ckpt_False.py delete mode 100644 configs/30B_train/131072_none_ckpt_True.py delete mode 100644 configs/30B_train/16384_flash_ckpt_False.py delete mode 100644 configs/30B_train/16384_flash_ckpt_True.py delete mode 100644 configs/30B_train/16384_intern_ckpt_False.py delete mode 100644 configs/30B_train/16384_intern_ckpt_True.py delete mode 100644 configs/30B_train/16384_megatron_ckpt_False.py delete mode 100644 configs/30B_train/16384_megatron_ckpt_True.py delete 
mode 100644 configs/30B_train/16384_none_ckpt_False.py delete mode 100644 configs/30B_train/16384_none_ckpt_True.py delete mode 100644 configs/30B_train/262144_flash_ckpt_False.py delete mode 100644 configs/30B_train/262144_flash_ckpt_True.py delete mode 100644 configs/30B_train/262144_intern_ckpt_False.py delete mode 100644 configs/30B_train/262144_intern_ckpt_True.py delete mode 100644 configs/30B_train/262144_megatron_ckpt_False.py delete mode 100644 configs/30B_train/262144_megatron_ckpt_True.py delete mode 100644 configs/30B_train/262144_none_ckpt_False.py delete mode 100644 configs/30B_train/262144_none_ckpt_True.py delete mode 100644 configs/30B_train/32768_flash_ckpt_False.py delete mode 100644 configs/30B_train/32768_flash_ckpt_True.py delete mode 100644 configs/30B_train/32768_intern_ckpt_False.py delete mode 100644 configs/30B_train/32768_intern_ckpt_True.py delete mode 100644 configs/30B_train/32768_megatron_ckpt_False.py delete mode 100644 configs/30B_train/32768_megatron_ckpt_True.py delete mode 100644 configs/30B_train/32768_none_ckpt_False.py delete mode 100644 configs/30B_train/32768_none_ckpt_True.py delete mode 100644 configs/30B_train/4096_flash_ckpt_False.py delete mode 100644 configs/30B_train/4096_flash_ckpt_True.py delete mode 100644 configs/30B_train/4096_intern_ckpt_False.py delete mode 100644 configs/30B_train/4096_intern_ckpt_True.py delete mode 100644 configs/30B_train/4096_megatron_ckpt_False.py delete mode 100644 configs/30B_train/4096_megatron_ckpt_True.py delete mode 100644 configs/30B_train/4096_none_ckpt_False.py delete mode 100644 configs/30B_train/4096_none_ckpt_True.py delete mode 100644 configs/30B_train/65536_flash_ckpt_False.py delete mode 100644 configs/30B_train/65536_flash_ckpt_True.py delete mode 100644 configs/30B_train/65536_intern_ckpt_False.py delete mode 100644 configs/30B_train/65536_intern_ckpt_True.py delete mode 100644 configs/30B_train/65536_megatron_ckpt_False.py delete mode 100644 configs/30B_train/65536_megatron_ckpt_True.py delete mode 100644 configs/30B_train/65536_none_ckpt_False.py delete mode 100644 configs/30B_train/65536_none_ckpt_True.py delete mode 100644 configs/30B_train/8192_flash_ckpt_False.py delete mode 100644 configs/30B_train/8192_flash_ckpt_True.py delete mode 100644 configs/30B_train/8192_intern_ckpt_False.py delete mode 100644 configs/30B_train/8192_intern_ckpt_True.py delete mode 100644 configs/30B_train/8192_megatron_ckpt_False.py delete mode 100644 configs/30B_train/8192_megatron_ckpt_True.py delete mode 100644 configs/30B_train/8192_none_ckpt_False.py delete mode 100644 configs/30B_train/8192_none_ckpt_True.py delete mode 100644 configs/7B_train/131072_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/131072_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/131072_flash_ckpt_False.py delete mode 100644 configs/7B_train/131072_flash_ckpt_True.py delete mode 100644 configs/7B_train/131072_intern_ckpt_False.py delete mode 100644 configs/7B_train/131072_intern_ckpt_True.py delete mode 100644 configs/7B_train/131072_megatron_ckpt_False.py delete mode 100644 configs/7B_train/131072_megatron_ckpt_True.py delete mode 100644 configs/7B_train/131072_none_ckpt_False.py delete mode 100644 configs/7B_train/131072_none_ckpt_True.py delete mode 100644 configs/7B_train/16384_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/16384_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/16384_flash_ckpt_False.py delete mode 100644 configs/7B_train/16384_flash_ckpt_True.py delete mode 100644 
configs/7B_train/16384_intern_ckpt_False.py delete mode 100644 configs/7B_train/16384_intern_ckpt_True.py delete mode 100644 configs/7B_train/16384_megatron_ckpt_False.py delete mode 100644 configs/7B_train/16384_megatron_ckpt_True.py delete mode 100644 configs/7B_train/16384_none_ckpt_False.py delete mode 100644 configs/7B_train/16384_none_ckpt_True.py delete mode 100644 configs/7B_train/262144_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/262144_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/262144_flash_ckpt_False.py delete mode 100644 configs/7B_train/262144_flash_ckpt_True.py delete mode 100644 configs/7B_train/262144_intern_ckpt_False.py delete mode 100644 configs/7B_train/262144_intern_ckpt_True.py delete mode 100644 configs/7B_train/262144_megatron_ckpt_False.py delete mode 100644 configs/7B_train/262144_megatron_ckpt_True.py delete mode 100644 configs/7B_train/262144_none_ckpt_False.py delete mode 100644 configs/7B_train/262144_none_ckpt_True.py delete mode 100644 configs/7B_train/32768_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/32768_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/32768_flash_ckpt_False.py delete mode 100644 configs/7B_train/32768_flash_ckpt_True.py delete mode 100644 configs/7B_train/32768_intern_ckpt_False.py delete mode 100644 configs/7B_train/32768_intern_ckpt_True.py delete mode 100644 configs/7B_train/32768_megatron_ckpt_False.py delete mode 100644 configs/7B_train/32768_megatron_ckpt_True.py delete mode 100644 configs/7B_train/32768_none_ckpt_False.py delete mode 100644 configs/7B_train/32768_none_ckpt_True.py delete mode 100644 configs/7B_train/4096_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/4096_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/4096_flash_ckpt_False.py delete mode 100644 configs/7B_train/4096_flash_ckpt_True.py delete mode 100644 configs/7B_train/4096_intern_ckpt_False.py delete mode 100644 configs/7B_train/4096_intern_ckpt_True.py delete mode 100644 configs/7B_train/4096_megatron_ckpt_False.py delete mode 100644 configs/7B_train/4096_megatron_ckpt_True.py delete mode 100644 configs/7B_train/4096_none_ckpt_False.py delete mode 100644 configs/7B_train/4096_none_ckpt_True.py delete mode 100644 configs/7B_train/65536_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/65536_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/65536_flash_ckpt_False.py delete mode 100644 configs/7B_train/65536_flash_ckpt_True.py delete mode 100644 configs/7B_train/65536_intern_ckpt_False.py delete mode 100644 configs/7B_train/65536_intern_ckpt_True.py delete mode 100644 configs/7B_train/65536_megatron_ckpt_False.py delete mode 100644 configs/7B_train/65536_megatron_ckpt_True.py delete mode 100644 configs/7B_train/65536_none_ckpt_False.py delete mode 100644 configs/7B_train/65536_none_ckpt_True.py delete mode 100644 configs/7B_train/8192_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/8192_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/8192_flash_ckpt_False.py delete mode 100644 configs/7B_train/8192_flash_ckpt_True.py delete mode 100644 configs/7B_train/8192_intern_ckpt_False.py delete mode 100644 configs/7B_train/8192_intern_ckpt_True.py delete mode 100644 configs/7B_train/8192_megatron_ckpt_False.py delete mode 100644 configs/7B_train/8192_megatron_ckpt_True.py delete mode 100644 configs/7B_train/8192_none_ckpt_False.py delete mode 100644 configs/7B_train/8192_none_ckpt_True.py diff --git a/.gitignore b/.gitignore index 
04367e3d..9bdc7ec7 100644 --- a/.gitignore +++ b/.gitignore @@ -149,5 +149,9 @@ memory_trace 13b_train*/ 30b_train*/ fstp_logs/ +configs/7B_train/* +configs/13B_train/* +configs/30B_train/* + atb pip diff --git a/configs/13B_template.py b/configs/13B_template.py index 26be3f71..e0e016cc 100644 --- a/configs/13B_template.py +++ b/configs/13B_template.py @@ -2,7 +2,7 @@ DO_ALERT = False SEQ_LEN = {seq_len} -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) HIDDEN_SIZE = 5120 NUM_ATTENTION_HEAD = 40 MLP_RATIO = 8 / 3 @@ -50,9 +50,9 @@ data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, + micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, + micro_bsz=1, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -91,7 +91,7 @@ hybrid_zero_optimizer = dict( # Enable low_level_optimzer overlap_communication overlap_sync_grad=True, - overlap_sync_param=True, + overlap_sync_param=False, # bucket size for nccl communication params reduce_bucket_size=512 * 1024 * 1024, # grad clipping diff --git a/configs/13B_train/131072_flash-attn_ckpt_False.py b/configs/13B_train/131072_flash-attn_ckpt_False.py deleted file mode 100644 index 28d51af6..00000000 --- a/configs/13B_train/131072_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. 
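[Editor's note on where the mass deletion in this patch comes from: the per-run configs are now ignored via .gitignore and produced on demand from the templates. configs/generate.py appears only in the diffstat, so the following is a hypothetical reconstruction of the rendering loop it implies; the value grids, the format() call, and the file layout are inferred from the {seq_len}/{sp}/{intern_overlap}/{checkpoint} placeholders in the 13B_template.py diff above and from the deleted file names, not taken from the patch.]

import itertools
from pathlib import Path

# Grids inferred from the deleted file names, e.g.
# configs/13B_train/131072_flash-attn_ckpt_False.py
SEQ_LENS = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
SP_MODES = ["none", "megatron", "flash", "flash-attn", "intern"]

template = Path("configs/13B_template.py").read_text()
out_dir = Path("configs/13B_train")
out_dir.mkdir(parents=True, exist_ok=True)
for seq_len, sp, ckpt in itertools.product(SEQ_LENS, SP_MODES, [False, True]):
    rendered = template.format(
        seq_len=seq_len,
        sp=f'"{sp}"',                     # lands inside str({sp}) in the template
        intern_overlap=(sp == "intern"),  # only the intern mode overlaps
        checkpoint=ckpt,
    )
    (out_dir / f"{seq_len}_{sp}_ckpt_{ckpt}.py").write_text(rendered)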
- auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. 
size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash-attn_ckpt_True.py b/configs/13B_train/131072_flash-attn_ckpt_True.py deleted file mode 100644 index 6d1b7ef0..00000000 --- a/configs/13B_train/131072_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
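[Editor's note: the grad_scaler block these deleted configs all share spells out a standard dynamic loss-scaling policy. As a reading aid, here is a toy implementation of the algorithm those knobs configure; it is a sketch of the common scheme, not InternLM's scaler, and resetting the overflow budget on every clean step is one assumed variant.]

class ToyGradScaler:
    """Dynamic loss scaling as parameterized by the grad_scaler dict:
    grow the scale after `growth_interval` clean steps, back off after
    `hysteresis` overflows, and clamp to [min_scale, max_scale]."""

    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2, backoff_factor=0.5,
                 hysteresis=2):
        self.scale = float(initial_scale)
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self._good_steps = 0
        self._overflows_left = hysteresis

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            self._overflows_left = self.hysteresis
            if self._good_steps >= self.growth_interval:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0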
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash_ckpt_False.py b/configs/13B_train/131072_flash_ckpt_False.py deleted file mode 100644 index dd0f0e89..00000000 --- a/configs/13B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
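[Editor's note: the zero1 sizing rules quoted in the docstring above reduce to a small amount of arithmetic. A sketch, assuming the subgroup must divide the data-parallel world size evenly; the docstring only says it must be a subset, so that check is an assumption.]

def resolve_zero1_size(size: int, dp_world_size: int) -> int:
    """Illustrative resolution of the zero1 rules (not InternLM's code):
    size <= 0  -> shard across the whole data-parallel group;
    size == 1  -> ZeRO disabled, every rank keeps full parameters;
    otherwise  -> a subgroup of the data-parallel group."""
    if size <= 0:
        return dp_world_size
    if size == 1:
        return 1
    if size > dp_world_size or dp_world_size % size != 0:
        raise ValueError(f"invalid zero1 size {size} for dp={dp_world_size}")
    return size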
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash_ckpt_True.py b/configs/13B_train/131072_flash_ckpt_True.py deleted file mode 100644 index 2b9276db..00000000 --- a/configs/13B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
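[Editor's note: the lr_scheduler dict in these configs carries only hyperparameters; the scheduler class itself is not shown in this series. Assuming the usual warmup-then-cosine shape that settings like total_steps, warmup_ratio, and eta_min typically drive, the resulting curve would be:]

import math

def lr_at_step(step, total_steps, base_lr=1e-4, init_steps=0,
               warmup_ratio=0.01, eta_min=1e-5):
    """Assumed warmup+cosine curve for the lr_scheduler hyperparameters
    above (adam.lr=1e-4, warmup_ratio=0.01, eta_min=1e-5); the concrete
    scheduler InternLM uses may differ in details."""
    warmup_steps = init_steps + max(1, int(total_steps * warmup_ratio))
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps  # linear warmup
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * progress))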
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_intern_ckpt_False.py b/configs/13B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 182e4ddb..00000000 --- a/configs/13B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
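[Editor's note: a worked example of the data-plan comments in these deleted configs, taken at the 131072-token sequence length with their micro_num=4 and micro_bsz=2. Figures are per data-parallel rank; the configs do not spell out the global batch arithmetic.]

SEQ_LEN = 131072
micro_bsz = 2   # samples packed into one micro-batch
micro_num = 4   # micro-batches per gradient update

packed_length = micro_bsz * SEQ_LEN          # 262,144 tokens per micro-batch
tokens_per_step = packed_length * micro_num  # 1,048,576 tokens per rank per step
assert tokens_per_step == 2**20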
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_intern_ckpt_True.py b/configs/13B_train/131072_intern_ckpt_True.py deleted file mode 100644 index c23a3c10..00000000 --- a/configs/13B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
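[Editor's note: model.checkpoint accepts True, False, or a proportion in [0, 1], per the comment repeated in these configs. A hypothetical helper showing how a proportion could map onto a set of recomputed layers; the actual selection logic is not part of this patch, and picking the first layers is an assumption for illustration.]

def checkpointed_layers(num_layers: int, checkpoint) -> list:
    """Interpret the model.checkpoint field: True/False, or a float
    proportion of layers whose activations are recomputed in backward."""
    if checkpoint is True:
        ratio = 1.0
    elif checkpoint is False:
        ratio = 0.0
    else:
        ratio = float(checkpoint)
    return list(range(round(num_layers * ratio)))

# e.g. NUM_LAYER = 40, checkpoint = 0.25 -> layers 0..9 are recomputed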
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_megatron_ckpt_False.py b/configs/13B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 935ff98d..00000000 --- a/configs/13B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
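Every config carries the same `grad_scaler` block. As a rough sketch of the dynamic loss-scaling policy those fields describe (our simplified model, not the trainer's actual bookkeeping): the scale grows by `growth_factor` after `growth_interval` overflow-free steps, backs off by `backoff_factor` once `hysteresis` overflows accumulate, and stays clamped to [min_scale, max_scale].

# Simplified sketch of the policy implied by the grad_scaler fields; the real
# optimizer may differ in detail.
class LossScaler:
    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2,
                 backoff_factor=0.5, hysteresis=2):
        self.scale = initial_scale
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self.good_steps = 0   # consecutive steps without overflow
        self.overflows = 0    # overflows seen since the last backoff

    def update(self, found_overflow):
        if found_overflow:
            self.good_steps = 0
            self.overflows += 1
            if self.overflows >= self.hysteresis:  # back off only after repeated overflows
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self.overflows = 0
        else:
            self.good_steps += 1
            if self.good_steps >= self.growth_interval:  # grow after a clean run
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self.good_steps = 0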
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_megatron_ckpt_True.py b/configs/13B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index 441166c2..00000000 --- a/configs/13B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
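The `data` block is likewise shared across the sweep. Following its comment `packed_length = micro_bsz * SEQ_LEN`, the token arithmetic per optimizer step works out as below; the world size is hypothetical and the dp-size formula is our reading of the parallel layout, not something the config states.

# Token arithmetic for the shared data block (illustrative assumptions noted).
SEQ_LEN = 131072    # this group of configs; the 16384 group scales down accordingly
micro_bsz = 2
micro_num = 4

packed_length = micro_bsz * SEQ_LEN               # 262144 tokens per micro-batch
tokens_per_rank_step = micro_num * packed_length  # 1048576 tokens per rank per step

world_size, tensor, pipeline = 32, 8, 1       # hypothetical cluster
dp_size = world_size // (tensor * pipeline)   # 4, assuming dp = world / (tp * pp)
tokens_per_global_step = tokens_per_rank_step * dp_size  # 4194304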
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_none_ckpt_False.py b/configs/13B_train/131072_none_ckpt_False.py deleted file mode 100644 index e43d6044..00000000 --- a/configs/13B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
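The long comment in each `ckpt` block describes a precedence rule worth restating: with `auto_resume=True`, the latest checkpoint under `save_ckpt_folder` wins and `load_ckpt_info` is ignored; initializing from another model or training from scratch both require `auto_resume=False`. A condensed sketch of that decision (the function and helper names are ours, not repo functions):

def latest_checkpoint_in(folder):
    # Stub for illustration; the real implementation scans the folder.
    return None

def resolve_ckpt(auto_resume, save_ckpt_folder, load_ckpt_info):
    if auto_resume:
        latest = latest_checkpoint_in(save_ckpt_folder)
        if latest is not None:
            return latest              # scheduler restarts resume here
    if load_ckpt_info is not None:
        return load_ckpt_info["path"]  # explicit init from another model
    return None                        # train from scratch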
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_none_ckpt_True.py b/configs/13B_train/131072_none_ckpt_True.py deleted file mode 100644 index 0945dbdc..00000000 --- a/configs/13B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
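`model["checkpoint"]` accepts True/False or a float in [0, 1], read as the proportion of transformer layers to activation-checkpoint. A one-line sketch of how that proportion maps onto NUM_LAYER=40; the config only documents the range, so the rounding choice here is our assumption.

# model["checkpoint"] as a proportion of layers to recompute (rounding assumed).
NUM_LAYER = 40

def num_ckpt_layers(checkpoint):
    frac = float(checkpoint)  # True -> 1.0, False -> 0.0, floats pass through
    return int(NUM_LAYER * frac)

assert num_ckpt_layers(True) == 40 and num_ckpt_layers(False) == 0
assert num_ckpt_layers(0.5) == 20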
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash-attn_ckpt_False.py b/configs/13B_train/16384_flash-attn_ckpt_False.py deleted file mode 100644 index 393e54d3..00000000 --- a/configs/13B_train/16384_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash-attn_ckpt_True.py b/configs/13B_train/16384_flash-attn_ckpt_True.py deleted file mode 100644 index 7f7e7ac6..00000000 --- a/configs/13B_train/16384_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash_ckpt_False.py b/configs/13B_train/16384_flash_ckpt_False.py deleted file mode 100644 index cadd215f..00000000 --- a/configs/13B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash_ckpt_True.py b/configs/13B_train/16384_flash_ckpt_True.py deleted file mode 100644 index c60ea730..00000000 --- a/configs/13B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
diff --git a/configs/13B_train/16384_flash_ckpt_True.py b/configs/13B_train/16384_flash_ckpt_True.py
deleted file mode 100644
index c60ea730..00000000
--- a/configs/13B_train/16384_flash_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
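With CHECKPOINT_EVERY = 50 and oss_snapshot_freq = int(CHECKPOINT_EVERY / 2), snapshot saves land halfway between full checkpoint saves. A small sketch of that cadence (the loop is illustrative only; it is not the trainer's actual save logic):

    CHECKPOINT_EVERY = 50
    oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)  # == 25

    for step in range(1, 101):
        if step % CHECKPOINT_EVERY == 0:
            print(f"step {step}: full checkpoint")  # steps 50, 100
        elif step % oss_snapshot_freq == 0:
            print(f"step {step}: oss snapshot")     # steps 25, 75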
diff --git a/configs/13B_train/16384_intern_ckpt_False.py b/configs/13B_train/16384_intern_ckpt_False.py
deleted file mode 100644
index e5d6fa6b..00000000
--- a/configs/13B_train/16384_intern_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
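The data comments fix packed_length = micro_bsz * SEQ_LEN, so each of these 16K configs packs 2 * 16384 = 32768 tokens into a micro-batch, and micro_num=4 micro-batches make up one gradient update. Treating that as a per-data-parallel-rank figure is an assumption:

    SEQ_LEN = 16384
    micro_bsz = 2
    micro_num = 4

    packed_length = micro_bsz * SEQ_LEN          # 32768 tokens per packed micro-batch
    tokens_per_step = micro_num * packed_length  # 131072 tokens per optimizer step (per dp rank, assumed)
    print(packed_length, tokens_per_step)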
diff --git a/configs/13B_train/16384_intern_ckpt_True.py b/configs/13B_train/16384_intern_ckpt_True.py
deleted file mode 100644
index 6ac47ac2..00000000
--- a/configs/13B_train/16384_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/13B_train/16384_megatron_ckpt_False.py b/configs/13B_train/16384_megatron_ckpt_False.py
deleted file mode 100644
index 24429ead..00000000
--- a/configs/13B_train/16384_megatron_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
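The grad_scaler block describes a standard dynamic loss scaler: grow by growth_factor after growth_interval overflow-free steps, back off by backoff_factor once hysteresis overflows accumulate, and clamp to [min_scale, max_scale]. A compact sketch of that policy, built only from the documented knobs (not the trainer's actual scaler class):

    class LossScaleSketch:
        """Dynamic loss scaling as described by the grad_scaler comments above."""

        def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                     growth_interval=1000, growth_factor=2, backoff_factor=0.5, hysteresis=2):
            self.scale, self.min_scale, self.max_scale = initial_scale, min_scale, max_scale
            self.growth_interval, self.growth_factor = growth_interval, growth_factor
            self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
            self.good_steps = 0
            self.bad_steps = 0

        def update(self, overflow: bool) -> None:
            if overflow:
                self.good_steps, self.bad_steps = 0, self.bad_steps + 1
                if self.bad_steps >= self.hysteresis:  # tolerate a few overflows first
                    self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                    self.bad_steps = 0
            else:
                self.good_steps += 1
                if self.good_steps >= self.growth_interval:
                    self.scale = min(self.scale * self.growth_factor, self.max_scale)
                    self.good_steps = 0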
diff --git a/configs/13B_train/16384_megatron_ckpt_True.py b/configs/13B_train/16384_megatron_ckpt_True.py
deleted file mode 100644
index d79c8207..00000000
--- a/configs/13B_train/16384_megatron_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/13B_train/16384_none_ckpt_False.py b/configs/13B_train/16384_none_ckpt_False.py
deleted file mode 100644
index a30d713a..00000000
--- a/configs/13B_train/16384_none_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
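model.checkpoint takes True/False or a proportion in [0, 1]; this sweep only toggles the boolean form, but under the proportional form roughly int(num_layers * p) of the 40 layers would be recomputed in backward. A sketch of that reading (the exact rounding rule is an assumption):

    def num_checkpointed_layers(checkpoint, num_layers=40):
        proportion = float(checkpoint)  # True/False behave like 1.0/0.0
        assert 0.0 <= proportion <= 1.0
        return int(num_layers * proportion)

    print(num_checkpointed_layers(True))   # 40
    print(num_checkpointed_layers(False))  # 0
    print(num_checkpointed_layers(0.25))   # 10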
diff --git a/configs/13B_train/16384_none_ckpt_True.py b/configs/13B_train/16384_none_ckpt_True.py
deleted file mode 100644
index 76483257..00000000
--- a/configs/13B_train/16384_none_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/13B_train/262144_flash-attn_ckpt_False.py b/configs/13B_train/262144_flash-attn_ckpt_False.py
deleted file mode 100644
index fd0be6a7..00000000
--- a/configs/13B_train/262144_flash-attn_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 262144
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
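lr_scheduler resolves warmup_ratio=0.01 against total_steps; with the 20-step benchmark budget used throughout this sweep, that truncates to zero warmup steps, which is worth noticing before reusing these configs for real runs. A quick check, assuming warmup_steps = int(warmup_ratio * total_steps):

    total_steps = 20
    warmup_ratio = 0.01

    warmup_steps = int(warmup_ratio * total_steps)  # assumed resolution rule
    print(warmup_steps)  # 0 -> the 20-step budget leaves no warmup at all;
                         # total_steps=20000 with the same ratio would give 200.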
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_flash_ckpt_True.py b/configs/13B_train/262144_flash_ckpt_True.py deleted file mode 100644 index f990655a..00000000 --- a/configs/13B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_intern_ckpt_False.py b/configs/13B_train/262144_intern_ckpt_False.py deleted file mode 100644 index 7ebcf94f..00000000 --- a/configs/13B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicates the ckpt path, - # 2. the 'content' means which states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only works for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batches contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, which means evaluation is disabled - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with fewer than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimizer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_intern_ckpt_True.py b/configs/13B_train/262144_intern_ckpt_True.py deleted file mode 100644 index e958ac06..00000000 --- a/configs/13B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_megatron_ckpt_False.py b/configs/13B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index 31e96f78..00000000 --- a/configs/13B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_megatron_ckpt_True.py b/configs/13B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 2339244b..00000000 --- a/configs/13B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_none_ckpt_False.py b/configs/13B_train/262144_none_ckpt_False.py deleted file mode 100644 index 41d55e91..00000000 --- a/configs/13B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_none_ckpt_True.py b/configs/13B_train/262144_none_ckpt_True.py deleted file mode 100644 index 4f2da605..00000000 --- a/configs/13B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash-attn_ckpt_False.py b/configs/13B_train/32768_flash-attn_ckpt_False.py deleted file mode 100644 index 3eb0f493..00000000 --- a/configs/13B_train/32768_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash-attn_ckpt_True.py b/configs/13B_train/32768_flash-attn_ckpt_True.py deleted file mode 100644 index 26b06ef3..00000000 --- a/configs/13B_train/32768_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash_ckpt_False.py b/configs/13B_train/32768_flash_ckpt_False.py deleted file mode 100644 index da30a4dd..00000000 --- a/configs/13B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash_ckpt_True.py b/configs/13B_train/32768_flash_ckpt_True.py deleted file mode 100644 index 20d415a5..00000000 --- a/configs/13B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_intern_ckpt_False.py b/configs/13B_train/32768_intern_ckpt_False.py deleted file mode 100644 index 05ab5285..00000000 --- a/configs/13B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_intern_ckpt_True.py b/configs/13B_train/32768_intern_ckpt_True.py deleted file mode 100644 index 273a812d..00000000 --- a/configs/13B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
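These benchmark configs sweep the tensor-parallel sp mode over "none", "megatron", "flash"/"flash-attn", and "intern", and only the intern configs set intern_overlap=True. A small validation sketch of that convention follows; check_tensor_cfg is hypothetical, and the constraint is inferred from the configs themselves, not from the trainer:

    # Sketch: validate the (sp, intern_overlap) pairs used by these configs.
    VALID_SP_MODES = {"none", "megatron", "flash", "flash-attn", "intern"}

    def check_tensor_cfg(tensor_cfg: dict) -> None:
        sp = tensor_cfg.get("sp", "none")
        if sp not in VALID_SP_MODES:
            raise ValueError(f"unknown sp mode: {sp}")
        # only the 'intern' mode overlaps communication with computation
        if tensor_cfg.get("intern_overlap", False) and sp != "intern":
            raise ValueError("intern_overlap=True requires sp='intern'")

    check_tensor_cfg(dict(size=8, sp="intern", intern_overlap=True))
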
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_megatron_ckpt_False.py b/configs/13B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index c8db542d..00000000 --- a/configs/13B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_megatron_ckpt_True.py b/configs/13B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index 9ff56012..00000000 --- a/configs/13B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
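Each _ckpt_True/_ckpt_False pair of configs differs only in the model dict's checkpoint field, whose comment describes it as the proportion of layers for activation checkpointing (True/False or a value in [0, 1]). A sketch of how that proportion could map to a layer count, using a hypothetical helper:

    # Sketch of the checkpoint proportion -> layer count mapping.
    def num_checkpointed_layers(checkpoint, num_layers: int) -> int:
        if isinstance(checkpoint, bool):
            return num_layers if checkpoint else 0
        assert 0 <= checkpoint <= 1, "checkpoint proportion must be in [0, 1]"
        return int(num_layers * checkpoint)

    print(num_checkpointed_layers(True, 40))   # 40: checkpoint every layer
    print(num_checkpointed_layers(0.25, 40))   # 10: checkpoint a quarter of them
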
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_none_ckpt_False.py b/configs/13B_train/32768_none_ckpt_False.py deleted file mode 100644 index a02e0711..00000000 --- a/configs/13B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_none_ckpt_True.py b/configs/13B_train/32768_none_ckpt_True.py deleted file mode 100644 index b9b17e3c..00000000 --- a/configs/13B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
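The data dicts in these configs note that packed_length = micro_bsz * SEQ_LEN. For the 32K configs, the per-rank token budget per optimizer step works out as:

    # Worked arithmetic from the data dict of the 32768 configs.
    SEQ_LEN, micro_bsz, micro_num = 32768, 2, 4
    packed_length = micro_bsz * SEQ_LEN          # 65536 tokens per micro batch
    tokens_per_step = packed_length * micro_num  # 262144 tokens per rank per step
    print(packed_length, tokens_per_step)
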
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash-attn_ckpt_False.py b/configs/13B_train/4096_flash-attn_ckpt_False.py deleted file mode 100644 index 8e4459ea..00000000 --- a/configs/13B_train/4096_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash-attn_ckpt_True.py b/configs/13B_train/4096_flash-attn_ckpt_True.py deleted file mode 100644 index a8f5e39b..00000000 --- a/configs/13B_train/4096_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
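All of these configs share the same dynamic loss-scaling settings (initial_scale, growth_interval, growth_factor, backoff_factor, max_scale, min_scale, hysteresis). A minimal sketch of the schedule those fields describe, assuming a simplified overflow counter rather than the trainer's actual implementation:

    # Sketch of the dynamic loss-scale schedule named by grad_scaler.
    class LossScaler:
        def __init__(self, scale=2**16, growth_interval=1000, growth_factor=2,
                     backoff_factor=0.5, max_scale=2**24, min_scale=1, hysteresis=2):
            self.scale, self.steps, self.overflows = scale, 0, 0
            self.gi, self.gf, self.bf = growth_interval, growth_factor, backoff_factor
            self.max, self.min, self.hyst = max_scale, min_scale, hysteresis

        def update(self, overflow: bool) -> None:
            if overflow:
                self.steps = 0
                self.overflows += 1
                if self.overflows >= self.hyst:  # back off after repeated overflows
                    self.scale = max(self.scale * self.bf, self.min)
                    self.overflows = 0
            else:
                self.steps += 1
                if self.steps >= self.gi:        # grow after a clean run of steps
                    self.scale = min(self.scale * self.gf, self.max)
                    self.steps = 0

    scaler = LossScaler()
    for _ in range(1000):
        scaler.update(overflow=False)
    print(scaler.scale)  # 131072 after one clean growth interval
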
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash_ckpt_False.py b/configs/13B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 517b46e4..00000000 --- a/configs/13B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash_ckpt_True.py b/configs/13B_train/4096_flash_ckpt_True.py deleted file mode 100644 index eacfcdfd..00000000 --- a/configs/13B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
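-    # A note on the data dict below, assuming the packed_length relation quoted there:
-    # packed_length = micro_bsz * SEQ_LEN = 2 * 4096 = 8192 tokens per micro-batch, so one
-    # gradient update consumes micro_num * packed_length = 4 * 8192 = 32768 tokens per data-parallel rank.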
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_intern_ckpt_False.py b/configs/13B_train/4096_intern_ckpt_False.py deleted file mode 100644 index 5ecf2d66..00000000 --- a/configs/13B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/13B_train/4096_intern_ckpt_True.py b/configs/13B_train/4096_intern_ckpt_True.py
deleted file mode 100644
index b70acb01..00000000
--- a/configs/13B_train/4096_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that when `auto_resume` is left at its default value of True, the checkpoint path
-    # specified in `load_ckpt_info` will not be loaded.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
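-    # A note on the grad_scaler dict below, per the comments in that dict: the loss scale starts
-    # at 2**16, doubles (growth_factor=2) after every 1000 overflow-free steps (growth_interval),
-    # halves (backoff_factor=0.5) once 2 overflows accumulate (hysteresis), and stays within
-    # [min_scale=1, max_scale=2**24].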
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_megatron_ckpt_False.py b/configs/13B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index 2e847a64..00000000 --- a/configs/13B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_megatron_ckpt_True.py b/configs/13B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index d8ba2c57..00000000 --- a/configs/13B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
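-    # A note on the lr_scheduler dict below: warmup length works out to
-    # warmup_ratio * total_steps = 0.01 * 20 = 0.2 steps for these short benchmark runs, so
-    # warmup is effectively disabled (assuming the scheduler truncates the fractional step count).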
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_none_ckpt_False.py b/configs/13B_train/4096_none_ckpt_False.py deleted file mode 100644 index f8bbdfc5..00000000 --- a/configs/13B_train/4096_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
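-    # A note on the beta2_scheduler dict below: with c = adam_beta2_c = 0 the schedule presumably
-    # degenerates to a constant beta2 = 0.95; the exact formula lives in the scheduler
-    # implementation, not in this config.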
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_none_ckpt_True.py b/configs/13B_train/4096_none_ckpt_True.py deleted file mode 100644 index d8f8ec7e..00000000 --- a/configs/13B_train/4096_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
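-    # A note on the parallel dict at the end of this file: zero1 size=-1 falls in the size <= 0
-    # branch described in the docstring there, so the zero process group spans the whole
-    # data-parallel group and parameters are partitioned across the full dp range.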
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash-attn_ckpt_False.py b/configs/13B_train/65536_flash-attn_ckpt_False.py deleted file mode 100644 index 09367f5a..00000000 --- a/configs/13B_train/65536_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash-attn_ckpt_True.py b/configs/13B_train/65536_flash-attn_ckpt_True.py deleted file mode 100644 index dc283a92..00000000 --- a/configs/13B_train/65536_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
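The `grad_scaler` block that appears in each of these configs parameterizes standard dynamic loss scaling: grow the scale by `growth_factor` after `growth_interval` overflow-free steps, back off by `backoff_factor` once `hysteresis` consecutive overflows accumulate, and clamp to `[min_scale, max_scale]`. A minimal sketch of that update rule, assuming the conventional semantics rather than InternLM's exact scaler code:

```python
# Sketch of the standard dynamic loss-scale update that the `grad_scaler`
# fields parameterize. Illustrative only; InternLM's scaler may differ in detail.
class DynamicLossScale:
    def __init__(self, initial_scale=2**16, growth_factor=2, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale = initial_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.hysteresis = hysteresis
        self.min_scale = min_scale
        self.max_scale = max_scale
        self._good_steps = 0
        self._overflows_left = hysteresis

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:
                # Back off after `hysteresis` consecutive overflows.
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            self._overflows_left = self.hysteresis
            if self._good_steps % self.growth_interval == 0:
                # Grow the scale after a long overflow-free stretch.
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
```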
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash_ckpt_False.py b/configs/13B_train/65536_flash_ckpt_False.py deleted file mode 100644 index 482d5114..00000000 --- a/configs/13B_train/65536_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
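One detail worth flagging in the `lr_scheduler` block: with `total_steps=20` and `warmup_ratio=0.01`, the warmup span rounds down to zero steps, so warmup only takes effect once `total_steps` is raised to a realistic value. Below is a sketch of the implied schedule shape, assuming `warmup_ratio` is a fraction of `total_steps` and a cosine decay to `eta_min`; both are assumptions, not something this diff confirms.

```python
import math

# Sketch of a warmup + cosine-decay schedule driven by the `lr_scheduler`
# fields above. The exact curve InternLM uses may differ.
def lr_at(step, lr=1e-4, total_steps=20, warmup_ratio=0.01, eta_min=1e-5):
    warmup_steps = int(warmup_ratio * total_steps)  # 0 for these toy settings
    if warmup_steps and step < warmup_steps:
        # Linear warmup from 0 to the base learning rate.
        return lr * (step + 1) / warmup_steps
    # Cosine decay from the base learning rate down to eta_min.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (lr - eta_min) * (1 + math.cos(math.pi * progress))
```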
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash_ckpt_True.py b/configs/13B_train/65536_flash_ckpt_True.py deleted file mode 100644 index 66051f83..00000000 --- a/configs/13B_train/65536_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
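The data-geometry comments in these configs (`packed_length = micro_bsz * SEQ_LEN`) fix the per-step token budget directly. A worked example with the values from the 65536-length configs; the dp world size of 4 is an assumed illustration, not something pinned down by the diff:

```python
# Worked example of the data geometry described in the config comments.
SEQ_LEN = 65536
micro_bsz = 2
micro_num = 4
dp_world_size = 4  # assumed value for illustration

packed_length = micro_bsz * SEQ_LEN             # 131072 tokens per micro-batch
tokens_per_rank = micro_num * packed_length     # 524288 tokens per dp rank per step
tokens_per_step = tokens_per_rank * dp_world_size
print(tokens_per_step)                          # 2097152 tokens per global step
```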
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_intern_ckpt_False.py b/configs/13B_train/65536_intern_ckpt_False.py deleted file mode 100644 index f829652a..00000000 --- a/configs/13B_train/65536_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
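`CHECKPOINT_EVERY` and `oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)` interleave full checkpoints with lighter snapshots. A hypothetical driver sketch of that cadence; the action names are invented for illustration and do not correspond to InternLM functions:

```python
# Sketch of how CHECKPOINT_EVERY and oss_snapshot_freq interleave full
# checkpoints with snapshots. Hypothetical loop, not InternLM's trainer code.
CHECKPOINT_EVERY = 50
oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)  # 25

def ckpt_actions(step: int) -> list:
    actions = []
    if step % CHECKPOINT_EVERY == 0:
        actions.append("save_full_checkpoint")   # durable ckpt every 50 steps
    elif step % oss_snapshot_freq == 0:
        actions.append("save_snapshot")          # cheaper snapshot in between
    return actions

assert ckpt_actions(25) == ["save_snapshot"]
assert ckpt_actions(50) == ["save_full_checkpoint"]
```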
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_intern_ckpt_True.py b/configs/13B_train/65536_intern_ckpt_True.py deleted file mode 100644 index 4e94d0e3..00000000 --- a/configs/13B_train/65536_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
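Across this benchmark matrix the `tensor` dict cycles `sp` through none/flash/flash-attn/megatron/intern, and only the `intern` variants enable `intern_overlap`. A small sanity-check sketch of that convention; the constraint set is inferred from the configs themselves and is an assumption, not an official validator:

```python
# Sanity-check sketch for the `tensor` parallel settings used across these
# benchmark configs. The rule that intern_overlap only pairs with sp="intern"
# is inferred from the configs, not from InternLM documentation.
VALID_SP_MODES = {"none", "flash", "flash-attn", "megatron", "intern"}

def check_tensor_cfg(tensor: dict) -> None:
    assert tensor["sp"] in VALID_SP_MODES, f"unknown sp mode: {tensor['sp']}"
    if tensor.get("intern_overlap", False):
        assert tensor["sp"] == "intern", "intern_overlap only pairs with sp='intern'"

check_tensor_cfg(dict(size=8, sp="intern", intern_overlap=True))
check_tensor_cfg(dict(size=8, sp="megatron", intern_overlap=False))
```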
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_megatron_ckpt_False.py b/configs/13B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index a9293334..00000000 --- a/configs/13B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_megatron_ckpt_True.py b/configs/13B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index 845e32bc..00000000 --- a/configs/13B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_none_ckpt_False.py b/configs/13B_train/65536_none_ckpt_False.py deleted file mode 100644 index 52ce3c52..00000000 --- a/configs/13B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_none_ckpt_True.py b/configs/13B_train/65536_none_ckpt_True.py deleted file mode 100644 index de5532e1..00000000 --- a/configs/13B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash-attn_ckpt_False.py b/configs/13B_train/8192_flash-attn_ckpt_False.py deleted file mode 100644 index 3324c290..00000000 --- a/configs/13B_train/8192_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
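The grad_scaler block in these configs describes a conventional dynamic loss scaler: grow the scale after growth_interval clean steps, back off after hysteresis overflows, and clamp to [min_scale, max_scale]. A self-contained sketch under that reading; only the field names and defaults come from the config, the exact update policy is an assumption:

    class LossScaler:
        def __init__(self, initial_scale=2**16, growth_interval=1000, growth_factor=2,
                     backoff_factor=0.5, hysteresis=2, min_scale=1, max_scale=2**24):
            self.scale = initial_scale
            self.growth_interval, self.growth_factor = growth_interval, growth_factor
            self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
            self.min_scale, self.max_scale = min_scale, max_scale
            self._good_steps = 0
            self._overflows = 0

        def update(self, found_overflow: bool) -> None:
            if found_overflow:
                self._good_steps = 0
                self._overflows += 1
                if self._overflows >= self.hysteresis:   # back off after N overflows
                    self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                    self._overflows = 0
            else:
                self._good_steps += 1
                if self._good_steps >= self.growth_interval:  # grow after a clean run
                    self.scale = min(self.scale * self.growth_factor, self.max_scale)
                    self._good_steps = 0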
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash-attn_ckpt_True.py b/configs/13B_train/8192_flash-attn_ckpt_True.py deleted file mode 100644 index 317e0f32..00000000 --- a/configs/13B_train/8192_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
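The data dict documents packed_length = micro_bsz * SEQ_LEN, with micro_num micro-batches per gradient update. A back-of-the-envelope throughput check built from those fields; dp_size is an assumption here, since the data-parallel degree is not part of this dict:

    seq_len, micro_bsz, micro_num, dp_size = 8192, 2, 4, 8
    packed_length = micro_bsz * seq_len               # as documented in the config
    tokens_per_step = packed_length * micro_num * dp_size
    print(tokens_per_step)                            # 524288 tokens per gradient update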
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash_ckpt_False.py b/configs/13B_train/8192_flash_ckpt_False.py deleted file mode 100644 index d645dc1b..00000000 --- a/configs/13B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
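The lr_scheduler fields (init_steps, warmup_ratio, eta_min) suggest a linear warmup followed by a decay toward eta_min. A hedged sketch of that shape; the cosine form is an assumption, only the field names and values come from the config above:

    import math

    def lr_at(step, lr=1e-4, total_steps=20, init_steps=0,
              warmup_ratio=0.01, eta_min=1e-5):
        warmup = init_steps + max(1, int(warmup_ratio * total_steps))
        if step < warmup:
            return lr * (step + 1) / warmup           # linear warmup
        t = (step - warmup) / max(1, total_steps - warmup)
        return eta_min + 0.5 * (lr - eta_min) * (1 + math.cos(math.pi * t))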
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash_ckpt_True.py b/configs/13B_train/8192_flash_ckpt_True.py deleted file mode 100644 index 425859c0..00000000 --- a/configs/13B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_intern_ckpt_False.py b/configs/13B_train/8192_intern_ckpt_False.py deleted file mode 100644 index 0b4fb8a2..00000000 --- a/configs/13B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_intern_ckpt_True.py b/configs/13B_train/8192_intern_ckpt_True.py deleted file mode 100644 index b42cb769..00000000 --- a/configs/13B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
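Across this deleted grid, tensor["sp"] takes the values "none", "flash", "flash-attn", "megatron", and "intern", and intern_overlap is enabled only together with sp="intern". A sketch of the consistency check those files imply; the function is hypothetical, the allowed values are just those that appear in this hunk:

    SP_MODES = {"none", "flash", "flash-attn", "megatron", "intern"}

    def check_tensor_cfg(tensor: dict) -> None:
        assert tensor["sp"] in SP_MODES, f"unknown sp mode: {tensor['sp']}"
        if tensor.get("intern_overlap", False):
            assert tensor["sp"] == "intern", "intern_overlap requires sp='intern'"

    check_tensor_cfg(dict(size=8, sp="intern", intern_overlap=True))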
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_megatron_ckpt_False.py b/configs/13B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index e2191937..00000000 --- a/configs/13B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_megatron_ckpt_True.py b/configs/13B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index 5123c412..00000000 --- a/configs/13B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_none_ckpt_False.py b/configs/13B_train/8192_none_ckpt_False.py deleted file mode 100644 index c9d9c050..00000000 --- a/configs/13B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
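The model comment in these configs allows checkpoint to be True, False, or a float in [0, 1] read as the proportion of layers under activation checkpointing. A minimal sketch of that interpretation (the helper name is illustrative):

    def num_ckpt_layers(checkpoint, num_layers=40):
        if isinstance(checkpoint, bool):              # bool must be tested before float
            return num_layers if checkpoint else 0
        assert 0 <= checkpoint <= 1
        return int(num_layers * checkpoint)

    assert num_ckpt_layers(True) == 40 and num_ckpt_layers(0.5) == 20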
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_none_ckpt_True.py b/configs/13B_train/8192_none_ckpt_True.py deleted file mode 100644 index 182ec21f..00000000 --- a/configs/13B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_template.py b/configs/30B_template.py index 7a32015e..4ac99bf0 100644 --- a/configs/30B_template.py +++ b/configs/30B_template.py @@ -2,7 +2,7 @@ DO_ALERT = False SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) HIDDEN_SIZE = 6144 NUM_ATTENTION_HEAD = 48 MLP_RATIO = 8 / 3 @@ -50,9 +50,9 @@ data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, + micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, + micro_bsz=1, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -91,7 +91,7 @@ hybrid_zero_optimizer = dict( # Enable low_level_optimzer overlap_communication overlap_sync_grad=True, - overlap_sync_param=True, + overlap_sync_param=False, # bucket size for nccl communication params reduce_bucket_size=512 * 1024 * 1024, # grad clipping diff --git a/configs/30B_train/131072_flash_ckpt_False.py b/configs/30B_train/131072_flash_ckpt_False.py deleted file mode 100644 index 3af48f3e..00000000 --- a/configs/30B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. 
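[Editorial note on the 30B_template.py hunk above: the job name now encodes the {sp}, {intern_overlap}, and {checkpoint} placeholders, which matches the naming scheme of the per-experiment files deleted in this patch (<seq_len>_<sp>_ckpt_<checkpoint>.py). The generator script itself is not part of the patch; the sketch below is a plausible reconstruction assuming plain placeholder substitution into the template. The sequence lengths and sp modes are taken from the deleted paths; everything else is hypothetical.]

# Hypothetical reconstruction: the script that produced configs/30B_train/*.py
# is not shipped in this patch, so the names and logic here are assumptions
# inferred from 30B_template.py and the deleted file names.
import itertools
from pathlib import Path

template = Path("configs/30B_template.py").read_text()
seq_lens = [16384, 131072]                        # lengths seen in the deleted paths
sp_modes = ["none", "flash", "megatron", "intern"]

for seq_len, sp, checkpoint in itertools.product(seq_lens, sp_modes, (False, True)):
    intern_overlap = sp == "intern"               # only sp="intern" overlaps comm. in these configs
    cfg = (
        template
        .replace("{seq_len}", str(seq_len))
        .replace("{sp}", f'"{sp}"')               # substituted into str({sp}) in the template
        .replace("{intern_overlap}", str(intern_overlap))
        .replace("{checkpoint}", str(checkpoint))
    )
    Path(f"configs/30B_train/{seq_len}_{sp}_ckpt_{checkpoint}.py").write_text(cfg)

[Plain str.replace is used instead of str.format because the template's commented boto3 lines contain literal braces like {BOTO3_IP} that format() would try to fill.]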
- # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. 
- * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_flash_ckpt_True.py b/configs/30B_train/131072_flash_ckpt_True.py deleted file mode 100644 index 4bd249bc..00000000 --- a/configs/30B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. 
- auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. 
size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_intern_ckpt_False.py b/configs/30B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 77b176d2..00000000 --- a/configs/30B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
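[Editorial note: the docstring above still documents a `mode` field limited to 'origin_tp'/'fstp', while the `parallel` dicts in these files actually use `tensor=dict(size=..., sp=..., intern_overlap=...)` with sp one of "none", "flash", "megatron", "intern". A small sanity check illustrating the constraints as stated in these configs follows; `check_parallel_cfg` is our illustrative helper, not project code.]

# Illustrative sketch only: validates the parallel settings as they appear in
# the sweep configs deleted here. Helper name and messages are ours, not InternLM's.
VALID_SP_MODES = {"none", "flash", "megatron", "intern"}

def check_parallel_cfg(parallel: dict) -> None:
    tensor = parallel["tensor"]
    sp = tensor["sp"]
    if sp not in VALID_SP_MODES:
        raise ValueError(f"unknown sp mode: {sp!r}")
    # In these sweep configs, overlap is only enabled together with sp="intern".
    if tensor.get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap=True requires sp='intern'")
    if parallel["pipeline"]["size"] < 1:
        raise ValueError("pipeline size must be >= 1")

check_parallel_cfg(dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=8, sp="flash", intern_overlap=False),
    pipeline=dict(size=1, interleaved_overlap=True),
))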
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_intern_ckpt_True.py b/configs/30B_train/131072_intern_ckpt_True.py deleted file mode 100644 index 38a1db3b..00000000 --- a/configs/30B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
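[Editorial note: the grad_scaler fields repeated in these configs (initial_scale, growth_interval, growth_factor, backoff_factor, max_scale, min_scale, hysteresis) describe a standard dynamic loss-scaling loop. The toy class below is a sketch of those documented semantics only; it is not InternLM's scaler implementation.]

# Toy dynamic loss scaler mirroring the grad_scaler comments above; illustrative only.
class LossScaler:
    def __init__(self, initial_scale=2**16, growth_interval=1000, growth_factor=2.0,
                 backoff_factor=0.5, max_scale=2**24, min_scale=1.0, hysteresis=2):
        self.scale = float(initial_scale)
        self.growth_interval = growth_interval
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.max_scale = max_scale
        self.min_scale = min_scale
        self.hysteresis = hysteresis
        self._good_steps = 0   # consecutive steps without overflow
        self._overflows = 0    # overflows seen since the last backoff

    def update(self, found_overflow: bool) -> float:
        if found_overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:      # back off after `hysteresis` overflows
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows = 0
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:  # grow after a clean interval
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0
        return self.scale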
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_megatron_ckpt_False.py b/configs/30B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 49879303..00000000 --- a/configs/30B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_megatron_ckpt_True.py b/configs/30B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index d911d381..00000000 --- a/configs/30B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_none_ckpt_False.py b/configs/30B_train/131072_none_ckpt_False.py deleted file mode 100644 index 78b3c9a8..00000000 --- a/configs/30B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_none_ckpt_True.py b/configs/30B_train/131072_none_ckpt_True.py deleted file mode 100644 index 941279e7..00000000 --- a/configs/30B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_flash_ckpt_False.py b/configs/30B_train/16384_flash_ckpt_False.py
deleted file mode 100644
index 779a10bc..00000000
--- a/configs/30B_train/16384_flash_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
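# Hedged sketch of the zero1 size rule from the docstring above: size <= 0 falls
# back to the data-parallel world size, size == 1 disables ZeRO sharding, and
# 1 < size <= dp world size shards within a subset of dp ranks. The divisibility
# check is an assumption, not something the docstring states.
def resolve_zero1_size(size: int, dp_world_size: int) -> int:
    if size <= 0:
        return dp_world_size
    assert size <= dp_world_size, "zero1 size cannot exceed dp world size"
    assert dp_world_size % size == 0, "assumed: zero1 size divides dp world size"
    return size

assert resolve_zero1_size(-1, 8) == 8  # zero1=dict(size=-1) on 8 dp ranks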
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_flash_ckpt_True.py b/configs/30B_train/16384_flash_ckpt_True.py
deleted file mode 100644
index 0498e2c4..00000000
--- a/configs/30B_train/16384_flash_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
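# This deleted sweep covers four sequence-parallel variants; a hypothetical
# validator whose mode list is inferred purely from the sp= values that appear
# in these config files ("none", "flash", "intern", "megatron"):
VALID_SP_MODES = ("none", "flash", "intern", "megatron")

def check_tensor_cfg(tensor_cfg: dict) -> None:
    assert tensor_cfg["sp"] in VALID_SP_MODES, f"unknown sp mode: {tensor_cfg['sp']}"
    if tensor_cfg.get("intern_overlap", False):
        # in every config of this sweep, intern_overlap=True pairs with sp="intern"
        assert tensor_cfg["sp"] == "intern"

check_tensor_cfg(dict(size=8, sp="flash", intern_overlap=False))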
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_intern_ckpt_False.py b/configs/30B_train/16384_intern_ckpt_False.py
deleted file mode 100644
index 309a33f0..00000000
--- a/configs/30B_train/16384_intern_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
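# The model.checkpoint field in these configs takes True/False or a float in
# [0, 1]; a sketch of how such a proportion might map to a number of
# checkpointed layers (the exact rounding used by InternLM is an assumption):
def num_ckpt_layers(checkpoint, num_layers: int = 60) -> int:
    proportion = float(checkpoint)  # True -> 1.0, False -> 0.0, floats pass through
    assert 0.0 <= proportion <= 1.0
    return int(num_layers * proportion)

assert num_ckpt_layers(True) == 60 and num_ckpt_layers(False) == 0
assert num_ckpt_layers(0.5) == 30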
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_intern_ckpt_True.py b/configs/30B_train/16384_intern_ckpt_True.py
deleted file mode 100644
index 23c977a5..00000000
--- a/configs/30B_train/16384_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
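# Following the data-dict comments in these configs (packed_length = micro_bsz *
# SEQ_LEN, with micro_num micro-batches per gradient update), the per-rank token
# budget of the 16K configs in this sweep works out as follows:
SEQ_LEN, micro_bsz, micro_num = 16384, 2, 4
packed_length = micro_bsz * SEQ_LEN          # 32768 tokens per micro-batch
tokens_per_step = micro_num * packed_length  # 131072 tokens per dp rank per step
assert (packed_length, tokens_per_step) == (32768, 131072)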
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_megatron_ckpt_False.py b/configs/30B_train/16384_megatron_ckpt_False.py
deleted file mode 100644
index 8576aa76..00000000
--- a/configs/30B_train/16384_megatron_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
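# A toy rendering of the dynamic loss-scale policy configured above, assuming
# the usual scheme: grow by growth_factor after growth_interval clean steps,
# back off by backoff_factor once `hysteresis` overflows accumulate, clamped to
# [min_scale, max_scale]. The real optimizer's bookkeeping may differ.
def next_scale(scale: int, clean_steps: int, overflows: int):
    if overflows >= 2:                              # hysteresis=2
        return max(int(scale * 0.5), 1), 0, 0       # backoff_factor=0.5, min_scale=1
    if clean_steps >= 1000:                         # growth_interval=1000
        return min(scale * 2, 2**24), 0, overflows  # growth_factor=2, max_scale=2**24
    return scale, clean_steps, overflows

assert next_scale(2**16, 0, 2)[0] == 2**15
assert next_scale(2**16, 1000, 0)[0] == 2**17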
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_megatron_ckpt_True.py b/configs/30B_train/16384_megatron_ckpt_True.py
deleted file mode 100644
index 460aba3b..00000000
--- a/configs/30B_train/16384_megatron_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
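# Sketch of the warmup arithmetic implied by the lr_scheduler dict in these
# configs: with total_steps=20 and warmup_ratio=0.01, warmup covers
# int(20 * 0.01) = 0 steps in this smoke-test setup, so decay toward eta_min
# starts immediately (scheduler internals are assumed; only the ratio math
# comes from the config).
total_steps, warmup_ratio = 20, 0.01
warmup_steps = int(total_steps * warmup_ratio)
assert warmup_steps == 0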
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_none_ckpt_False.py b/configs/30B_train/16384_none_ckpt_False.py
deleted file mode 100644
index 4ca50666..00000000
--- a/configs/30B_train/16384_none_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
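# The auto_resume comments above describe a precedence rule; a hypothetical
# restatement of it (function and argument names are illustrative only):
def pick_ckpt(auto_resume: bool, latest_in_save_folder, load_ckpt_info):
    if auto_resume:
        return latest_in_save_folder      # resume after an automatic restart
    if load_ckpt_info is not None:
        return load_ckpt_info["path"]     # warm-start from another model
    return None                           # train from scratch

assert pick_ckpt(False, None, dict(path="local:llm_ckpts/xxxx")) == "local:llm_ckpts/xxxx"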
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_none_ckpt_True.py b/configs/30B_train/16384_none_ckpt_True.py
deleted file mode 100644
index c7987e0d..00000000
--- a/configs/30B_train/16384_none_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
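# Per the data-dict comments, valid_micro_num falls back to micro_num and
# valid_every=0 disables evaluation; a small sketch of that defaulting logic:
def eval_plan(data_cfg: dict):
    valid_every = data_cfg.get("valid_every", 0)
    if valid_every == 0:
        return None                       # evaluation disabled
    return valid_every, data_cfg.get("valid_micro_num", data_cfg["micro_num"])

assert eval_plan(dict(micro_num=4, valid_every=50)) == (50, 4)
assert eval_plan(dict(micro_num=4)) is None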
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/262144_flash_ckpt_False.py b/configs/30B_train/262144_flash_ckpt_False.py
deleted file mode 100644
index 10d71d9c..00000000
--- a/configs/30B_train/262144_flash_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 262144
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
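# The data dict notes that datasets with fewer than min_length=50 rows are
# discarded; a hedged sketch of such a filter (the real dataloader's behavior
# may differ in detail):
def keep_dataset(num_rows: int, min_length: int = 50) -> bool:
    return num_rows >= min_length

assert keep_dataset(50) and not keep_dataset(49)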
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_flash_ckpt_True.py b/configs/30B_train/262144_flash_ckpt_True.py deleted file mode 100644 index a1990dbb..00000000 --- a/configs/30B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
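# The configs in this hunk vary `tensor.sp` across "none", "flash", "megatron", and
# "intern", and the docstring above notes that the fstp-style mode requires sequence
# parallelism to be enabled. A hedged validation sketch; the helper name, the exact
# rule set, and the intern_overlap constraint are inferred from the visible configs,
# not taken from InternLM's code:
VALID_SP_MODES = ("none", "megatron", "flash", "intern")

def check_parallel_config(tensor_cfg: dict, sequence_parallel: bool = True) -> None:
    sp = tensor_cfg.get("sp", "none")
    if sp not in VALID_SP_MODES:
        raise ValueError(f"unknown sp mode: {sp!r}")
    if sp != "none" and not sequence_parallel:
        raise ValueError(f"sp mode {sp!r} requires sequence parallelism to be enabled")
    if tensor_cfg.get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap=True is only meaningful with sp='intern'")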
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_intern_ckpt_False.py b/configs/30B_train/262144_intern_ckpt_False.py deleted file mode 100644 index f8ec6a2f..00000000 --- a/configs/30B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
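# The `grad_scaler` comments above describe standard dynamic loss scaling: grow the
# scale after `growth_interval` overflow-free steps, shrink it by `backoff_factor`
# once `hysteresis` overflows accumulate, clamped to [min_scale, max_scale]. A
# minimal sketch of one update step; field names follow the config, but the class
# itself is illustrative rather than InternLM's actual scaler:
class DynamicLossScale:
    def __init__(self, scale=2**16, growth_factor=2, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale = scale
        self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
        self.growth_interval, self.hysteresis = growth_interval, hysteresis
        self.min_scale, self.max_scale = min_scale, max_scale
        self._good_steps, self._overflows = 0, 0

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows = 0
        else:
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)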
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_intern_ckpt_True.py b/configs/30B_train/262144_intern_ckpt_True.py deleted file mode 100644 index c5afa46b..00000000 --- a/configs/30B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
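# `model.checkpoint` above is documented as a proportion: True/False or a float in
# [0, 1] selecting what fraction of layers use activation checkpointing. A sketch of
# how such a value could map onto a layer count (an assumed mapping, not the verbatim
# InternLM logic):
def num_checkpointed_layers(checkpoint, num_layers: int) -> int:
    proportion = float(checkpoint)  # True -> 1.0, False -> 0.0, floats pass through
    assert 0.0 <= proportion <= 1.0
    return int(num_layers * proportion)

# e.g. with NUM_LAYER = 60: checkpoint=True -> 60 layers, checkpoint=0.5 -> 30.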
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_megatron_ckpt_False.py b/configs/30B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index 412da179..00000000 --- a/configs/30B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
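# From the `data` comments above: packed_length = micro_bsz * SEQ_LEN, and micro_num
# micro-batches are accumulated per gradient update. Back-of-the-envelope token
# counts per optimizer step on one data-parallel rank, using this config's values:
SEQ_LEN, micro_bsz, micro_num = 262144, 2, 4
packed_length = micro_bsz * SEQ_LEN           # 524288 tokens per micro-batch
tokens_per_step = micro_num * packed_length   # 2097152 tokens per rank per step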
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_megatron_ckpt_True.py b/configs/30B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 79affb19..00000000 --- a/configs/30B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
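# The `ckpt` comments above describe the resume policy: with auto_resume=True the
# trainer loads the latest checkpoint under save_ckpt_folder and ignores
# load_ckpt_info; initializing from another model requires auto_resume=False, and
# training from scratch additionally requires load_ckpt_info=None. A sketch of that
# decision; the function and its arguments are hypothetical:
def select_ckpt_to_load(auto_resume: bool, latest_saved_ckpt, load_ckpt_info):
    if auto_resume:
        # Resume from the newest checkpoint under save_ckpt_folder, if any;
        # load_ckpt_info is ignored in this branch.
        return latest_saved_ckpt
    # auto_resume=False: honor load_ckpt_info (None means train from scratch).
    return load_ckpt_info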
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_none_ckpt_False.py b/configs/30B_train/262144_none_ckpt_False.py deleted file mode 100644 index e6fbe1eb..00000000 --- a/configs/30B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
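# The `lr_scheduler` block above combines a warmup phase (warmup_ratio * total_steps
# steps) with a decay toward eta_min. A common reading is linear warmup followed by
# cosine annealing; this sketch assumes that shape and is an interpretation, not
# InternLM's exact scheduler:
import math

def lr_at(step, total_steps=20, warmup_ratio=0.01, base_lr=1e-4, eta_min=1e-5):
    warmup_steps = max(1, int(total_steps * warmup_ratio))
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps  # linear warmup
    t = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))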
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_none_ckpt_True.py b/configs/30B_train/262144_none_ckpt_True.py deleted file mode 100644 index d507c30b..00000000 --- a/configs/30B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
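# `hybrid_zero_optimizer.reduce_bucket_size` above (512 MiB) bounds how many gradient
# bytes are fused into a single NCCL reduction. A generic bucketing sketch under that
# reading; this is not the actual optimizer code, and `grads` is assumed to be a list
# of tensors exposing numel()/element_size() (e.g. torch.Tensor):
def bucket_grads(grads, bucket_size_bytes=512 * 1024 * 1024):
    buckets, current, current_bytes = [], [], 0
    for g in grads:
        nbytes = g.numel() * g.element_size()
        if current and current_bytes + nbytes > bucket_size_bytes:
            buckets.append(current)  # bucket is full: flush it
            current, current_bytes = [], 0
        current.append(g)
        current_bytes += nbytes
    if current:
        buckets.append(current)
    # Each bucket would then be flattened and reduced in one collective call.
    return buckets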
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_flash_ckpt_False.py b/configs/30B_train/32768_flash_ckpt_False.py deleted file mode 100644 index 6bac5b31..00000000 --- a/configs/30B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
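# The deleted files in configs/30B_train/ follow the naming pattern
# {seq_len}_{sp}_ckpt_{True|False}.py, i.e. a benchmark grid over sequence length,
# sequence-parallel mode, and activation checkpointing. A sketch that enumerates the
# grid; only the two sequence lengths visible in this hunk are listed:
from itertools import product

seq_lens = (32768, 262144)
sp_modes = ("none", "flash", "megatron", "intern")
for seq_len, sp, ckpt in product(seq_lens, sp_modes, (False, True)):
    print(f"configs/30B_train/{seq_len}_{sp}_ckpt_{ckpt}.py")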
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_flash_ckpt_True.py b/configs/30B_train/32768_flash_ckpt_True.py deleted file mode 100644 index f21c9983..00000000 --- a/configs/30B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
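# `checkpoint_every` and `oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)` above
# suggest lightweight snapshots interleaved between full checkpoint saves. A cadence
# sketch under that assumption (the precedence of full saves over snapshots is a
# guess, not confirmed by the source):
CHECKPOINT_EVERY = 50
OSS_SNAPSHOT_FREQ = int(CHECKPOINT_EVERY / 2)  # 25

def save_action(step: int):
    if step > 0 and step % CHECKPOINT_EVERY == 0:
        return "full_checkpoint"     # full save wins when both cadences coincide
    if step > 0 and step % OSS_SNAPSHOT_FREQ == 0:
        return "snapshot"
    return None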
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_intern_ckpt_False.py b/configs/30B_train/32768_intern_ckpt_False.py deleted file mode 100644 index 79728d64..00000000 --- a/configs/30B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_intern_ckpt_True.py b/configs/30B_train/32768_intern_ckpt_True.py deleted file mode 100644 index 6dc24c30..00000000 --- a/configs/30B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_megatron_ckpt_False.py b/configs/30B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index 37fd0986..00000000 --- a/configs/30B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_megatron_ckpt_True.py b/configs/30B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index 986b27dd..00000000 --- a/configs/30B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_none_ckpt_False.py b/configs/30B_train/32768_none_ckpt_False.py deleted file mode 100644 index 9c6ca879..00000000 --- a/configs/30B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_none_ckpt_True.py b/configs/30B_train/32768_none_ckpt_True.py deleted file mode 100644 index d4ab7f2d..00000000 --- a/configs/30B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_flash_ckpt_False.py b/configs/30B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 3dd8be56..00000000 --- a/configs/30B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_flash_ckpt_True.py b/configs/30B_train/4096_flash_ckpt_True.py deleted file mode 100644 index 73150acf..00000000 --- a/configs/30B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_intern_ckpt_False.py b/configs/30B_train/4096_intern_ckpt_False.py deleted file mode 100644 index cff6c5b6..00000000 --- a/configs/30B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
diff --git a/configs/30B_train/4096_intern_ckpt_False.py b/configs/30B_train/4096_intern_ckpt_False.py
deleted file mode 100644
index cff6c5b6..00000000
--- a/configs/30B_train/4096_intern_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
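# ---------------------------------------------------------------------------
# Editor's sketch (not part of the patch): roughly how a dynamic loss scaler
# consumes the grad_scaler settings in these configs. Class and attribute names
# are hypothetical, not InternLM's actual implementation.
class DynamicLossScaler:
    def __init__(self, initial_scale=2**16, growth_factor=2, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale = float(initial_scale)
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.hysteresis = hysteresis
        self.min_scale = min_scale
        self.max_scale = max_scale
        self._good_steps = 0                # consecutive steps without overflow
        self._overflows_left = hysteresis   # overflows tolerated before backoff

    def update(self, found_overflow):
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:   # back off only after `hysteresis` overflows
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0
# ---------------------------------------------------------------------------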
diff --git a/configs/30B_train/4096_intern_ckpt_True.py b/configs/30B_train/4096_intern_ckpt_True.py
deleted file mode 100644
index 1fb64257..00000000
--- a/configs/30B_train/4096_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/4096_megatron_ckpt_False.py b/configs/30B_train/4096_megatron_ckpt_False.py
deleted file mode 100644
index 79f718d0..00000000
--- a/configs/30B_train/4096_megatron_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
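# ---------------------------------------------------------------------------
# Editor's worked example (not part of the patch): the batch arithmetic implied
# by the data dicts in these configs, using their own documented relation
# packed_length = micro_bsz * SEQ_LEN. The dp_size value is illustrative.
seq_len, micro_bsz, micro_num, dp_size = 4096, 2, 4, 4
packed_length = micro_bsz * seq_len          # 8192 tokens per micro-batch
tokens_per_rank = micro_num * packed_length  # 32768 tokens per optimizer step per dp rank
tokens_per_step = tokens_per_rank * dp_size  # 131072 tokens per global optimizer step
# ---------------------------------------------------------------------------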
diff --git a/configs/30B_train/4096_megatron_ckpt_True.py b/configs/30B_train/4096_megatron_ckpt_True.py
deleted file mode 100644
index 502ae7f7..00000000
--- a/configs/30B_train/4096_megatron_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/4096_none_ckpt_False.py b/configs/30B_train/4096_none_ckpt_False.py
deleted file mode 100644
index 981a0f23..00000000
--- a/configs/30B_train/4096_none_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/4096_none_ckpt_True.py b/configs/30B_train/4096_none_ckpt_True.py
deleted file mode 100644
index dddea663..00000000
--- a/configs/30B_train/4096_none_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
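# ---------------------------------------------------------------------------
# Editor's sketch (not part of the patch): one plausible reading of the model
# dicts' `checkpoint` field above, which accepts True/False or a proportion in
# [0, 1]. The helper name is hypothetical.
def num_checkpointed_layers(checkpoint, num_layers=60):
    if isinstance(checkpoint, bool):
        return num_layers if checkpoint else 0
    return int(num_layers * float(checkpoint))  # proportion of layers to recompute
# e.g. num_checkpointed_layers(True) == 60 and num_checkpointed_layers(0.5) == 30.
# ---------------------------------------------------------------------------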
diff --git a/configs/30B_train/65536_flash_ckpt_False.py b/configs/30B_train/65536_flash_ckpt_False.py
deleted file mode 100644
index babebd95..00000000
--- a/configs/30B_train/65536_flash_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
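# ---------------------------------------------------------------------------
# Editor's worked example (not part of the patch): the warmup length implied by
# the lr_scheduler dicts above for these short 20-step profiling runs.
total_steps, warmup_ratio = 20, 0.01
warmup_steps = int(total_steps * warmup_ratio)  # int(0.2) == 0, so effectively no warmup
# at a production total_steps of, say, 50000, the same ratio gives 500 warmup steps
# ---------------------------------------------------------------------------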
diff --git a/configs/30B_train/65536_flash_ckpt_True.py b/configs/30B_train/65536_flash_ckpt_True.py
deleted file mode 100644
index 064250e7..00000000
--- a/configs/30B_train/65536_flash_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/65536_intern_ckpt_False.py b/configs/30B_train/65536_intern_ckpt_False.py
deleted file mode 100644
index 64165f44..00000000
--- a/configs/30B_train/65536_intern_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/65536_intern_ckpt_True.py b/configs/30B_train/65536_intern_ckpt_True.py
deleted file mode 100644
index 78b66213..00000000
--- a/configs/30B_train/65536_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/65536_megatron_ckpt_False.py b/configs/30B_train/65536_megatron_ckpt_False.py
deleted file mode 100644
index e8c09548..00000000
--- a/configs/30B_train/65536_megatron_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_megatron_ckpt_True.py b/configs/30B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index d3b64c41..00000000 --- a/configs/30B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_none_ckpt_False.py b/configs/30B_train/65536_none_ckpt_False.py deleted file mode 100644 index ee4c7fb5..00000000 --- a/configs/30B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_none_ckpt_True.py b/configs/30B_train/65536_none_ckpt_True.py deleted file mode 100644 index 2e84144c..00000000 --- a/configs/30B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_flash_ckpt_False.py b/configs/30B_train/8192_flash_ckpt_False.py deleted file mode 100644 index b9eb6e65..00000000 --- a/configs/30B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_flash_ckpt_True.py b/configs/30B_train/8192_flash_ckpt_True.py deleted file mode 100644 index c0dd5175..00000000 --- a/configs/30B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_intern_ckpt_False.py b/configs/30B_train/8192_intern_ckpt_False.py deleted file mode 100644 index d915b6b8..00000000 --- a/configs/30B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_intern_ckpt_True.py b/configs/30B_train/8192_intern_ckpt_True.py deleted file mode 100644 index a71693a1..00000000 --- a/configs/30B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_megatron_ckpt_False.py b/configs/30B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index dcacb9e5..00000000 --- a/configs/30B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro batches in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/8192_megatron_ckpt_True.py b/configs/30B_train/8192_megatron_ckpt_True.py
deleted file mode 100644
index b6e4ba24..00000000
--- a/configs/30B_train/8192_megatron_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 8192
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_none_ckpt_False.py b/configs/30B_train/8192_none_ckpt_False.py deleted file mode 100644 index ce790dfa..00000000 --- a/configs/30B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_none_ckpt_True.py b/configs/30B_train/8192_none_ckpt_True.py deleted file mode 100644 index e6afcd4e..00000000 --- a/configs/30B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_template.py b/configs/7B_template.py
index b9f76a51..d78fc884 100644
--- a/configs/7B_template.py
+++ b/configs/7B_template.py
@@ -2,7 +2,7 @@
 DO_ALERT = False
 
 SEQ_LEN = {seq_len}
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint})
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
 MLP_RATIO = 8 / 3
diff --git a/configs/7B_train/131072_flash-attn_ckpt_False.py b/configs/7B_train/131072_flash-attn_ckpt_False.py
deleted file mode 100644
index 047fb372..00000000
--- a/configs/7B_train/131072_flash-attn_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 131072
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro batches in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which disables sequence parallel.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the
-        'intern' sp mode, defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/131072_flash-attn_ckpt_True.py b/configs/7B_train/131072_flash-attn_ckpt_True.py
deleted file mode 100644
index 763627d6..00000000
--- a/configs/7B_train/131072_flash-attn_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 131072
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash_ckpt_False.py b/configs/7B_train/131072_flash_ckpt_False.py deleted file mode 100644 index 4307e9d1..00000000 --- a/configs/7B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash_ckpt_True.py b/configs/7B_train/131072_flash_ckpt_True.py deleted file mode 100644 index c110b256..00000000 --- a/configs/7B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_intern_ckpt_False.py b/configs/7B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 1d728be7..00000000 --- a/configs/7B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
diff --git a/configs/7B_train/131072_intern_ckpt_True.py b/configs/7B_train/131072_intern_ckpt_True.py
deleted file mode 100644
index 45d4aa01..00000000
--- a/configs/7B_train/131072_intern_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 131072
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallelism.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which disables sequence parallelism.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallelism.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
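As the data comments note, packed_length = micro_bsz * seq_len, and one gradient update consumes micro_num micro-batches per data-parallel rank. A small hypothetical helper to sanity-check token throughput for these configs (the dp_size factor is an assumption about how the global batch is counted, not something these files set):

def tokens_per_step(seq_len, micro_bsz, micro_num, dp_size=1):
    # packed_length = micro_bsz * seq_len, as in the data dict above
    packed_length = micro_bsz * seq_len
    return micro_num * packed_length * dp_size

# For this config, seq_len=131072 with micro_bsz=1 and micro_num=1 packs one
# 131072-token sequence per rank per step; with dp_size=4 that is 524288 tokens.
print(tokens_per_step(131072, 1, 1, dp_size=4))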
diff --git a/configs/7B_train/131072_megatron_ckpt_False.py b/configs/7B_train/131072_megatron_ckpt_False.py
deleted file mode 100644
index 0bd98459..00000000
--- a/configs/7B_train/131072_megatron_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 131072
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_megatron_ckpt_True.py b/configs/7B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index 9200afbe..00000000 --- a/configs/7B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_none_ckpt_False.py b/configs/7B_train/131072_none_ckpt_False.py deleted file mode 100644 index 16059fb1..00000000 --- a/configs/7B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_none_ckpt_True.py b/configs/7B_train/131072_none_ckpt_True.py deleted file mode 100644 index 35b3f08e..00000000 --- a/configs/7B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash-attn_ckpt_False.py b/configs/7B_train/16384_flash-attn_ckpt_False.py deleted file mode 100644 index 53a64b99..00000000 --- a/configs/7B_train/16384_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash-attn_ckpt_True.py b/configs/7B_train/16384_flash-attn_ckpt_True.py deleted file mode 100644 index cdb051e5..00000000 --- a/configs/7B_train/16384_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash_ckpt_False.py b/configs/7B_train/16384_flash_ckpt_False.py deleted file mode 100644 index 41b39515..00000000 --- a/configs/7B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash_ckpt_True.py b/configs/7B_train/16384_flash_ckpt_True.py deleted file mode 100644 index ca2c7f06..00000000 --- a/configs/7B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
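# A minimal sketch of the dynamic loss-scaling policy that the grad_scaler
# fields in these configs describe (semantics assumed from the field comments,
# not taken from InternLM's actual scaler implementation):
scale, good_steps, overflows = 2.0**16, 0, 0

def update_scale(overflowed: bool):
    """One scaler update per training step."""
    global scale, good_steps, overflows
    if overflowed:
        overflows += 1
        good_steps = 0
        if overflows >= 2:                    # hysteresis: tolerate 2 overflows
            scale = max(scale * 0.5, 1.0)     # backoff_factor, floored by min_scale
            overflows = 0
    else:
        good_steps += 1
        if good_steps >= 1000:                # growth_interval of overflow-free steps
            scale = min(scale * 2.0, 2.0**24) # growth_factor, capped by max_scale
            good_steps = 0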
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_intern_ckpt_False.py b/configs/7B_train/16384_intern_ckpt_False.py deleted file mode 100644 index 93abb682..00000000 --- a/configs/7B_train/16384_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
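# Illustrative warmup arithmetic for the lr_scheduler fields in these configs
# (assuming the scheduler floors warmup_ratio * total_steps; the real scheduler
# may round differently): with total_steps = 20 and warmup_ratio = 0.01, the
# warmup phase is int(20 * 0.01) = 0 steps, so these short benchmark runs
# effectively skip warmup.
assert int(20 * 0.01) == 0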
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_intern_ckpt_True.py b/configs/7B_train/16384_intern_ckpt_True.py deleted file mode 100644 index af9d9945..00000000 --- a/configs/7B_train/16384_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
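# How zero1=dict(size=-1) resolves, per the rule stated in the docstring of
# these configs (the 32-GPU world size is a hypothetical example):
world_size, tensor_size, pipeline_size = 32, 8, 1
dp_size = world_size // (tensor_size * pipeline_size)  # data-parallel size = 4
zero1_size = dp_size  # size <= 0 falls back to the dp process-group size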
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_megatron_ckpt_False.py b/configs/7B_train/16384_megatron_ckpt_False.py deleted file mode 100644 index d2c58d3a..00000000 --- a/configs/7B_train/16384_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
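# Note on beta2_scheduler in these configs: since adam_beta2_c = 0 here, the
# decay term the scheduler would apply vanishes and beta2 stays fixed at
# init_beta2 = 0.95 for the whole run (an observation from the config values,
# not from the scheduler code itself):
init_beta2, c = 0.95, 0
beta2 = init_beta2 if c == 0 else None  # constant schedule in this config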
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_megatron_ckpt_True.py b/configs/7B_train/16384_megatron_ckpt_True.py deleted file mode 100644 index 6e372b8c..00000000 --- a/configs/7B_train/16384_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_none_ckpt_False.py b/configs/7B_train/16384_none_ckpt_False.py deleted file mode 100644 index 0fd65900..00000000 --- a/configs/7B_train/16384_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
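# Token-throughput arithmetic implied by the data dict in these configs
# (illustrative, following the packed_length comment):
SEQ_LEN, micro_bsz, micro_num = 16384, 1, 1
packed_length = micro_bsz * SEQ_LEN                       # 16384 tokens per micro-batch
tokens_per_step_per_dp_rank = micro_num * packed_length   # 16384 tokens per optimizer step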
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_none_ckpt_True.py b/configs/7B_train/16384_none_ckpt_True.py deleted file mode 100644 index 6ea5e1a9..00000000 --- a/configs/7B_train/16384_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash-attn_ckpt_False.py b/configs/7B_train/262144_flash-attn_ckpt_False.py deleted file mode 100644 index 6dad9730..00000000 --- a/configs/7B_train/262144_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
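# The model.checkpoint field in these configs accepts True/False or a fraction
# in [0, 1]; a sketch of how a fractional value would map onto layers (rounding
# behavior assumed, not taken from the model code):
NUM_LAYER = 32

def checkpointed_layers(flag):
    if flag is True:
        return NUM_LAYER
    if flag is False:
        return 0
    return int(NUM_LAYER * flag)  # e.g. 0.5 -> 16 layers recompute activations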
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash-attn_ckpt_True.py b/configs/7B_train/262144_flash-attn_ckpt_True.py deleted file mode 100644 index cacd9737..00000000 --- a/configs/7B_train/262144_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash_ckpt_False.py b/configs/7B_train/262144_flash_ckpt_False.py deleted file mode 100644 index 0e9b0173..00000000 --- a/configs/7B_train/262144_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
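# A minimal sketch of validating a `parallel` block like the ones in these
# configs; only the key names and the allowed sp modes come from the files,
# the function itself is hypothetical:
VALID_SP_MODES = ("none", "megatron", "flash-attn", "intern")

def check_parallel_cfg(parallel):
    sp = parallel["tensor"].get("sp", "none")
    if sp not in VALID_SP_MODES:
        raise ValueError(f"unknown sp mode {sp!r}, expected one of {VALID_SP_MODES}")
    if parallel["tensor"].get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap only takes effect when sp == 'intern'")

check_parallel_cfg(dict(tensor=dict(size=8, sp="flash-attn", intern_overlap=False)))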
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash_ckpt_True.py b/configs/7B_train/262144_flash_ckpt_True.py deleted file mode 100644 index ddacc8df..00000000 --- a/configs/7B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
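# Note: unlike their siblings, the two *_flash_* configs above build JOB_NAME
# without the sp-mode tag (e.g. "7b_train_262144_True"). A sketch of a less
# error-prone pattern; the helper name is illustrative, not from the files:
def make_job_name(seq_len, sp_mode, ckpt):
    return f"7b_train_{seq_len}_{sp_mode}_{ckpt}"

assert make_job_name(262144, "flash", True) == "7b_train_262144_flash_True"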
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_intern_ckpt_False.py b/configs/7B_train/262144_intern_ckpt_False.py deleted file mode 100644 index e5cf7694..00000000 --- a/configs/7B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
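# A minimal sketch of the dynamic loss-scaling rule the grad_scaler block
# above parameterizes (a generic mixed-precision recipe with a hypothetical
# class, not InternLM's implementation):
class LossScaler:
    def __init__(self, initial_scale=2**16, growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale, self.good_steps, self.overflows = initial_scale, 0, 0
        self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
        self.growth_interval, self.hysteresis = growth_interval, hysteresis
        self.min_scale, self.max_scale = min_scale, max_scale

    def update(self, found_overflow):
        if found_overflow:
            self.good_steps = 0
            self.overflows += 1
            if self.overflows >= self.hysteresis:  # tolerate brief overflow bursts
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self.overflows = 0
        else:
            self.good_steps += 1
            if self.good_steps % self.growth_interval == 0:  # stable long enough
                self.scale = min(self.scale * self.growth_factor, self.max_scale)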
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_intern_ckpt_True.py b/configs/7B_train/262144_intern_ckpt_True.py deleted file mode 100644 index 76f9386a..00000000 --- a/configs/7B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
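# The zero1 size rules quoted in the docstring above, restated as a runnable
# sketch (the function name and the divisibility check are assumptions):
def resolve_zero1_size(zero1_size, dp_world_size):
    if zero1_size <= 0:  # shard optimizer states across the whole dp group
        return dp_world_size
    if zero1_size == 1:  # ZeRO disabled; every dp rank keeps full states
        return 1
    assert dp_world_size % zero1_size == 0, "zero1 size must divide dp world size"
    return zero1_size

assert resolve_zero1_size(-1, 64) == 64
assert resolve_zero1_size(8, 64) == 8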
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_megatron_ckpt_False.py b/configs/7B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index b929f9a6..00000000 --- a/configs/7B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
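# A minimal sketch of one common reading of the lr_scheduler fields above:
# linear warmup over warmup_ratio * total_steps steps, then cosine decay to
# eta_min. InternLM's exact scheduler may differ; names here are assumptions.
import math

def lr_at(step, base_lr=1e-4, total_steps=20, warmup_ratio=0.01, eta_min=1e-5):
    warmup_steps = max(1, int(total_steps * warmup_ratio))
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps  # linear warmup
    t = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))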
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_megatron_ckpt_True.py b/configs/7B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 1655631c..00000000 --- a/configs/7B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
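# Quick arithmetic for the data block above, with assumed helper names: each
# rank packs micro_bsz * SEQ_LEN tokens per micro batch, so one optimizer step
# consumes packed_length * micro_num tokens per data-parallel rank.
def tokens_per_step(seq_len=262144, micro_bsz=1, micro_num=1, dp_world_size=1):
    packed_length = micro_bsz * seq_len  # matches the comment in `data`
    return packed_length * micro_num * dp_world_size

assert tokens_per_step() == 262144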
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_none_ckpt_False.py b/configs/7B_train/262144_none_ckpt_False.py deleted file mode 100644 index 85512f07..00000000 --- a/configs/7B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
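# A minimal sketch of the `checkpoint` field semantics noted in `model` above
# (True/False or a proportion in [0, 1]); the helper and the "first k layers"
# policy are assumptions for illustration:
def layers_to_checkpoint(checkpoint, num_layers=32):
    if checkpoint is True:
        proportion = 1.0
    elif checkpoint is False:
        proportion = 0.0
    else:
        proportion = float(checkpoint)
    return list(range(int(num_layers * proportion)))

assert len(layers_to_checkpoint(True)) == 32
assert len(layers_to_checkpoint(0.5)) == 16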
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_none_ckpt_True.py b/configs/7B_train/262144_none_ckpt_True.py deleted file mode 100644 index fef559bd..00000000 --- a/configs/7B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
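# A minimal sketch of the bucketing that reduce_bucket_size (in the
# hybrid_zero_optimizer block above) caps: gradients are grouped into
# ~512 MB buckets so each bucket needs one collective call. Generic pattern
# with assumed names, not InternLM's code.
def bucket_grads(grad_numels, elem_bytes=2, bucket_bytes=512 * 1024 * 1024):
    """Greedily pack gradient tensors (by element count) into size-capped buckets."""
    buckets, cur, cur_bytes = [], [], 0
    for idx, numel in enumerate(grad_numels):
        if cur and cur_bytes + numel * elem_bytes > bucket_bytes:
            buckets.append(cur)  # flush: the next tensor would overflow the cap
            cur, cur_bytes = [], 0
        cur.append(idx)
        cur_bytes += numel * elem_bytes
    if cur:
        buckets.append(cur)
    return buckets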
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash-attn_ckpt_False.py b/configs/7B_train/32768_flash-attn_ckpt_False.py deleted file mode 100644 index f2664be8..00000000 --- a/configs/7B_train/32768_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
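# A minimal sketch of resolving the `dtype` strings that `model` above accepts
# (assumed helper; "torch.tf32" is shown as fp32 storage with TF32 matmul):
import torch

def resolve_dtype(name):
    table = {
        "torch.float16": torch.float16,
        "torch.half": torch.float16,
        "torch.bfloat16": torch.bfloat16,
        "torch.float32": torch.float32,
    }
    if name == "torch.tf32":
        torch.backends.cuda.matmul.allow_tf32 = True  # enable TF32 kernels
        return torch.float32
    return table[name]

assert resolve_dtype("torch.bfloat16") is torch.bfloat16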
diff --git a/configs/7B_train/32768_flash-attn_ckpt_False.py b/configs/7B_train/32768_flash-attn_ckpt_False.py
deleted file mode 100644
index f2664be8..00000000
--- a/configs/7B_train/32768_flash-attn_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_flash-attn_ckpt_True.py b/configs/7B_train/32768_flash-attn_ckpt_True.py
deleted file mode 100644
index 232b5904..00000000
--- a/configs/7B_train/32768_flash-attn_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
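The `auto_resume` comments repeated in each of these configs describe a three-way decision. A hedged sketch of that rule follows; `resolve_ckpt_load` is a hypothetical helper written only to restate the documented behaviour, not InternLM's actual checkpoint API:

```python
# Hypothetical illustration of the resume rule documented in the ckpt dicts;
# InternLM's real checkpoint manager is not structured like this.
def resolve_ckpt_load(ckpt: dict):
    if ckpt.get("auto_resume", True):
        # scheduler restart: resume from the latest ckpt under save_ckpt_folder,
        # ignoring load_ckpt_info
        return ("resume_latest", ckpt["save_ckpt_folder"])
    if ckpt.get("load_ckpt_info"):
        # cold start from the weights named in load_ckpt_info
        return ("load_initial", ckpt["load_ckpt_info"]["path"])
    # no auto_resume and no load_ckpt_info: train from scratch
    return ("from_scratch", None)
```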
diff --git a/configs/7B_train/32768_flash_ckpt_False.py b/configs/7B_train/32768_flash_ckpt_False.py
deleted file mode 100644
index 878b9ac1..00000000
--- a/configs/7B_train/32768_flash_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_flash_ckpt_True.py b/configs/7B_train/32768_flash_ckpt_True.py
deleted file mode 100644
index 27cffd02..00000000
--- a/configs/7B_train/32768_flash_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_intern_ckpt_False.py b/configs/7B_train/32768_intern_ckpt_False.py
deleted file mode 100644
index fcf84197..00000000
--- a/configs/7B_train/32768_intern_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
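Note that only the 'intern' configs set `intern_overlap=True`, matching the docstring: the all_gather/reduce_scatter overlap applies only to the 'intern' sp mode. A hedged sketch of how a consumer might gate on this, assuming `gpc.config.parallel` mirrors the dict above (illustrative only, not code from this patch):

```python
from internlm.core.context import global_context as gpc

# Illustrative gating on the parallel config (assumes gpc.config.parallel
# mirrors the parallel dict defined in these configs).
tensor_cfg = gpc.config.parallel["tensor"]
overlap_comm = tensor_cfg.get("sp") == "intern" and tensor_cfg.get("intern_overlap", False)
```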
diff --git a/configs/7B_train/32768_intern_ckpt_True.py b/configs/7B_train/32768_intern_ckpt_True.py
deleted file mode 100644
index aec2b68b..00000000
--- a/configs/7B_train/32768_intern_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_megatron_ckpt_False.py b/configs/7B_train/32768_megatron_ckpt_False.py
deleted file mode 100644
index 64caeeb5..00000000
--- a/configs/7B_train/32768_megatron_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_megatron_ckpt_True.py b/configs/7B_train/32768_megatron_ckpt_True.py
deleted file mode 100644
index a736e7d0..00000000
--- a/configs/7B_train/32768_megatron_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
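Every config in this family shares the same `grad_scaler` block. A minimal sketch of the dynamic loss-scaling schedule those fields describe follows; it is an illustration of the documented parameters, not the trainer's actual scaler class:

```python
# Illustrative dynamic loss scaling per the grad_scaler fields in these configs.
scale = 2.0**16        # initial_scale
good_steps = 0         # steps since the last overflow
overflow_count = 0     # overflows since the last backoff

def update_scale(overflowed: bool) -> None:
    global scale, good_steps, overflow_count
    if overflowed:
        good_steps = 0
        overflow_count += 1
        if overflow_count >= 2:                # hysteresis
            scale = max(scale * 0.5, 1.0)      # backoff_factor, floored at min_scale
            overflow_count = 0
    else:
        good_steps += 1
        if good_steps >= 1000:                 # growth_interval
            scale = min(scale * 2.0, 2.0**24)  # growth_factor, capped at max_scale
            good_steps = 0
```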
diff --git a/configs/7B_train/32768_none_ckpt_False.py b/configs/7B_train/32768_none_ckpt_False.py
deleted file mode 100644
index 3a31776e..00000000
--- a/configs/7B_train/32768_none_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_none_ckpt_True.py b/configs/7B_train/32768_none_ckpt_True.py deleted file mode 100644 index 4ac09249..00000000 --- a/configs/7B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash-attn_ckpt_False.py b/configs/7B_train/4096_flash-attn_ckpt_False.py deleted file mode 100644 index b3de8990..00000000 --- a/configs/7B_train/4096_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash-attn_ckpt_True.py b/configs/7B_train/4096_flash-attn_ckpt_True.py deleted file mode 100644 index b44b103f..00000000 --- a/configs/7B_train/4096_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash_ckpt_False.py b/configs/7B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 8ac542d6..00000000 --- a/configs/7B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash_ckpt_True.py b/configs/7B_train/4096_flash_ckpt_True.py deleted file mode 100644 index ec477f68..00000000 --- a/configs/7B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_intern_ckpt_False.py b/configs/7B_train/4096_intern_ckpt_False.py deleted file mode 100644 index f16f95ad..00000000 --- a/configs/7B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_intern_ckpt_True.py b/configs/7B_train/4096_intern_ckpt_True.py deleted file mode 100644 index 90fed7c8..00000000 --- a/configs/7B_train/4096_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_megatron_ckpt_False.py b/configs/7B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index ca41fa28..00000000 --- a/configs/7B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_megatron_ckpt_True.py b/configs/7B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index 45183156..00000000 --- a/configs/7B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
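The `grad_scaler` dict above describes a conventional dynamic loss-scaling policy. A minimal sketch of such a policy (hypothetical helper, not InternLM's implementation), using the defaults from the config:

def step_loss_scale(scale, good_steps, overflow_count, growth_factor=2, backoff_factor=0.5,
                    growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
    # Shrink after `hysteresis` overflows; grow after `growth_interval` clean steps.
    if overflow_count >= hysteresis:
        return max(scale * backoff_factor, min_scale), 0, 0
    if good_steps >= growth_interval:
        return min(scale * growth_factor, max_scale), 0, overflow_count
    return scale, good_steps, overflow_count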
diff --git a/configs/7B_train/4096_none_ckpt_False.py b/configs/7B_train/4096_none_ckpt_False.py
deleted file mode 100644
index c81bb5b9..00000000
--- a/configs/7B_train/4096_none_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
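The comments in the `data` dict above imply a simple relation between `micro_bsz`, `SEQ_LEN`, and the packed sample length. A quick illustrative check with this config's values:

SEQ_LEN, micro_bsz, micro_num = 4096, 1, 1
packed_length = micro_bsz * SEQ_LEN            # tokens per packed sample
tokens_per_step = micro_num * packed_length    # tokens per gradient update, per dp rank
assert (packed_length, tokens_per_step) == (4096, 4096)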
diff --git a/configs/7B_train/4096_none_ckpt_True.py b/configs/7B_train/4096_none_ckpt_True.py
deleted file mode 100644
index a25d222f..00000000
--- a/configs/7B_train/4096_none_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
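Per the comment on `checkpoint` in the `model` dict, the field accepts True, False, or a proportion in [0, 1] of layers to recompute. A hypothetical normalization helper illustrating that reading (not project code):

def ckpt_layer_count(checkpoint, num_layers=32):
    # True -> all layers, False -> none, float -> that proportion of layers.
    ratio = 1.0 if checkpoint is True else (0.0 if checkpoint is False else float(checkpoint))
    return int(num_layers * ratio)  # e.g. True -> 32, 0.5 -> 16, False -> 0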
diff --git a/configs/7B_train/65536_flash-attn_ckpt_False.py b/configs/7B_train/65536_flash-attn_ckpt_False.py
deleted file mode 100644
index 3d5a81eb..00000000
--- a/configs/7B_train/65536_flash-attn_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
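The `zero1` docstring above distinguishes three regimes for `size`. An illustrative restatement in code (assumed helper name, not part of this patch):

def zero1_group_size(size, dp_world_size):
    if size <= 0:
        return dp_world_size  # shard optimizer states across the whole dp group
    assert 1 <= size <= dp_world_size  # size == 1 keeps full params on every rank
    return size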
diff --git a/configs/7B_train/65536_flash-attn_ckpt_True.py b/configs/7B_train/65536_flash-attn_ckpt_True.py
deleted file mode 100644
index c6982c98..00000000
--- a/configs/7B_train/65536_flash-attn_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
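These config files are plain Python modules, so any loader that executes them and collects the module-level names would work. A minimal sketch under that assumption (the project's actual Config machinery may differ):

def load_config(path):
    ns = {}
    with open(path) as f:
        exec(compile(f.read(), path, "exec"), ns)  # configs are executable Python
    return {k: v for k, v in ns.items() if not k.startswith("__")}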
diff --git a/configs/7B_train/65536_flash_ckpt_False.py b/configs/7B_train/65536_flash_ckpt_False.py
deleted file mode 100644
index 0cfea813..00000000
--- a/configs/7B_train/65536_flash_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/65536_flash_ckpt_True.py b/configs/7B_train/65536_flash_ckpt_True.py
deleted file mode 100644
index abdeb49d..00000000
--- a/configs/7B_train/65536_flash_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/65536_intern_ckpt_False.py b/configs/7B_train/65536_intern_ckpt_False.py
deleted file mode 100644
index 2e0b27e1..00000000
--- a/configs/7B_train/65536_intern_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/65536_intern_ckpt_True.py b/configs/7B_train/65536_intern_ckpt_True.py
deleted file mode 100644
index d1a8de7c..00000000
--- a/configs/7B_train/65536_intern_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_megatron_ckpt_False.py b/configs/7B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index 7de7b92d..00000000 --- a/configs/7B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
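
The ckpt comments repeated in these configs fully determine where a run resumes from: with auto_resume=True the newest checkpoint under save_ckpt_folder wins, otherwise load_ckpt_info (or None, for training from scratch) is used. A minimal sketch of that rule, assuming a local layout where each saved step gets a numeric subdirectory; the helper name and folder layout are illustrative, not the repository's actual API:

import os
from typing import Optional

def resolve_load_info(ckpt_cfg: dict) -> Optional[dict]:
    if ckpt_cfg.get("auto_resume", True):
        folder = ckpt_cfg["save_ckpt_folder"].split(":", 1)[-1]  # strip the 'local:' prefix
        if os.path.isdir(folder):
            steps = [d for d in os.listdir(folder) if d.isdigit()]
            if steps:  # resume from the highest saved step
                latest = max(steps, key=int)
                return dict(path=os.path.join(folder, latest), content=("all",), ckpt_type="internlm")
    # no auto-resume (or nothing saved yet): fall back to the explicit setting
    return ckpt_cfg.get("load_ckpt_info")
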
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_megatron_ckpt_True.py b/configs/7B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index b339c833..00000000 --- a/configs/7B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
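
Every config in this sweep repeats the same tensor-parallel docstring, so the allowed sp values and the coupling between sp and intern_overlap can be sanity-checked mechanically. A sketch under the stated constraints; the helper is illustrative, not part of the codebase:

VALID_SP_MODES = ("none", "megatron", "flash-attn", "intern")

def check_tensor_cfg(tensor_cfg: dict) -> None:
    sp = tensor_cfg.get("sp", "none")
    if sp not in VALID_SP_MODES:
        raise ValueError(f"sp must be one of {VALID_SP_MODES}, got {sp!r}")
    # intern_overlap is documented to matter only for 'intern' mode sp
    if tensor_cfg.get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap=True only takes effect when sp='intern'")

check_tensor_cfg(dict(size=8, sp="megatron", intern_overlap=False))  # passes
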
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_none_ckpt_False.py b/configs/7B_train/65536_none_ckpt_False.py deleted file mode 100644 index b8c44769..00000000 --- a/configs/7B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
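
The zero1 sizing rules in the docstring above reduce to a small amount of arithmetic. A sketch of that resolution; the divisibility assertion is an assumption of the illustration rather than a documented requirement:

def resolve_zero1_world_size(size: int, dp_world_size: int) -> int:
    if size <= 0:
        return dp_world_size  # shard optimizer states across the whole dp group
    if size == 1:
        return 1  # ZeRO disabled: every dp rank keeps the full parameters
    assert 1 < size <= dp_world_size and dp_world_size % size == 0
    return size  # shard within a subset, e.g. <= 8 to stay within one node

assert resolve_zero1_world_size(-1, 16) == 16
assert resolve_zero1_world_size(1, 16) == 1
assert resolve_zero1_world_size(8, 16) == 8
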
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_none_ckpt_True.py b/configs/7B_train/65536_none_ckpt_True.py deleted file mode 100644 index b907e437..00000000 --- a/configs/7B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
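
The grad_scaler block that recurs in these configs describes a standard dynamic loss-scaling policy: grow the scale after growth_interval overflow-free steps, back off after hysteresis overflows, and clamp to [min_scale, max_scale]. A minimal sketch of that policy, not the trainer's actual scaler class:

class DynamicLossScale:
    def __init__(self, cfg: dict):
        self.scale = float(cfg["fp16"]["initial_scale"])
        self.min_scale = cfg["fp16"]["min_scale"]
        self.growth_interval = cfg["fp16"]["growth_interval"]
        self.growth_factor = cfg["growth_factor"]
        self.backoff_factor = cfg["backoff_factor"]
        self.max_scale = cfg["max_scale"]
        self.hysteresis = cfg["hysteresis"]
        self._good_steps = 0
        self._overflows_left = self.hysteresis

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:  # enough overflows: decrease the scale
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:  # stable: increase the scale
                self.scale = min(self.scale * self.growth_factor, self.max_scale)

scaler = DynamicLossScale(dict(fp16=dict(initial_scale=2**16, min_scale=1, growth_interval=1000),
                               growth_factor=2, backoff_factor=0.5, max_scale=2**24, hysteresis=2))
scaler.update(found_overflow=True)   # first overflow: hysteresis absorbs it
scaler.update(found_overflow=True)   # second overflow: scale halves to 32768.0
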
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash-attn_ckpt_False.py b/configs/7B_train/8192_flash-attn_ckpt_False.py deleted file mode 100644 index d0ddd438..00000000 --- a/configs/7B_train/8192_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
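
The lr_scheduler fields in these configs (warmup_ratio, total_steps, eta_min) are consistent with a linear-warmup plus cosine-decay schedule. A sketch under that assumption; the codebase's actual scheduler class may differ in details:

import math

def lr_at(step: int, sched: dict, base_lr: float = 1e-4) -> float:
    total = sched["total_steps"]
    warmup = max(1, int(sched["warmup_ratio"] * total))
    if step < warmup:
        return base_lr * (step + 1) / warmup  # linear warmup to base_lr
    t = (step - warmup) / max(1, total - warmup)  # cosine decay to eta_min
    return sched["eta_min"] + 0.5 * (base_lr - sched["eta_min"]) * (1 + math.cos(math.pi * t))
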
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash-attn_ckpt_True.py b/configs/7B_train/8192_flash-attn_ckpt_True.py deleted file mode 100644 index d9e5b2f9..00000000 --- a/configs/7B_train/8192_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
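
model["checkpoint"] in these configs is documented as a proportion of layers for activation checkpointing, with True/False as shorthands for 1.0/0.0. One way such a proportion could be turned into a concrete layer set; which layers the trainer actually selects is not specified here:

def layers_to_checkpoint(checkpoint, num_layers: int) -> list:
    if checkpoint is True:
        ratio = 1.0
    elif checkpoint is False:
        ratio = 0.0
    else:
        ratio = float(checkpoint)  # a value in [0, 1]
    n = int(num_layers * ratio)
    return list(range(n))  # e.g. recompute activations for the first n blocks

assert layers_to_checkpoint(True, 32) == list(range(32))
assert layers_to_checkpoint(0.5, 32) == list(range(16))
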
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash_ckpt_False.py b/configs/7B_train/8192_flash_ckpt_False.py deleted file mode 100644 index 69546d11..00000000 --- a/configs/7B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
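
The data comments pin down the batch geometry: packed_length = micro_bsz * seq_len, and one gradient update consumes micro_num micro-batches per data-parallel rank. Multiplying in the dp world size (an assumption of this sketch, not stated in the config) gives the tokens seen per optimizer step:

def tokens_per_step(data_cfg: dict, dp_world_size: int) -> int:
    packed_length = data_cfg["micro_bsz"] * data_cfg["seq_len"]
    return data_cfg["micro_num"] * packed_length * dp_world_size

# with this file's settings (micro_num=1, micro_bsz=1, seq_len=8192) on 4 dp ranks:
assert tokens_per_step(dict(micro_num=1, micro_bsz=1, seq_len=8192), dp_world_size=4) == 32768
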
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash_ckpt_True.py b/configs/7B_train/8192_flash_ckpt_True.py deleted file mode 100644 index 4c7f9864..00000000 --- a/configs/7B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_intern_ckpt_False.py b/configs/7B_train/8192_intern_ckpt_False.py deleted file mode 100644 index 9694ad81..00000000 --- a/configs/7B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
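
Every file deleted under configs/7B_train follows one naming scheme, f"{SEQ_LEN}_{sp}_ckpt_{activation_ckpt}.py", varying only the sequence length, the sp mode, and the activation-checkpoint flag (note that the 8192_flash_* files use sp="flash", which is absent from the documented mode list). A sweep of this shape could be regenerated with a few lines; this is an illustration, not a script from the repository:

from itertools import product

for seq_len, sp, act_ckpt in product(
    (8192, 65536),
    ("none", "flash", "flash-attn", "megatron", "intern"),
    (False, True),
):
    print(f"configs/7B_train/{seq_len}_{sp}_ckpt_{act_ckpt}.py")
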
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_intern_ckpt_True.py b/configs/7B_train/8192_intern_ckpt_True.py deleted file mode 100644 index 99a0fc18..00000000 --- a/configs/7B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_megatron_ckpt_False.py b/configs/7B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index f18ee730..00000000 --- a/configs/7B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_megatron_ckpt_True.py b/configs/7B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index 1db58412..00000000 --- a/configs/7B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_none_ckpt_False.py b/configs/7B_train/8192_none_ckpt_False.py deleted file mode 100644 index 95d686bb..00000000 --- a/configs/7B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_none_ckpt_True.py b/configs/7B_train/8192_none_ckpt_True.py deleted file mode 100644 index a63b6f20..00000000 --- a/configs/7B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/generate.py b/configs/generate.py index 6a58f098..a8a5898a 100644 --- a/configs/generate.py +++ b/configs/generate.py @@ -6,8 +6,8 @@ root_names = ["7B_train_", "13B_train_", "30B_train_"] model_size = ["7B", "13B", "30B"] seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144] -sp = ["none", "megatron", "flash-attn", "intern"] -intern_overlap = [False, False, False, True] +sp = ["none", "megatron", "flash-attn", "intern", "intern"] +intern_overlap = [False, False, False, True, False] checkpoint = [False, True] for idx, root_name in enumerate(root_names): @@ -32,13 +32,29 @@ line = line.replace("{sp}", f"\"{sp_mode}\"") line = line.replace("{intern_overlap}", str(intern_overlap[i])) line = line.replace("{checkpoint}", str(ckpt)) - output_file_name = str(seq) + "_" + str(sp_mode) + "_ckpt_" + str(ckpt) + ".py" + output_file_name = str(seq) + "_" + str(sp_mode) + "_overlap_" + str(intern_overlap[i]) + "_ckpt_" + str(ckpt) + ".py" write_file = folder_path + "/" + output_file_name with open(write_file, "w") as file: file.write(line) log_name = root_name + "_" + output_file_name[:-3] - command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=10 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" + skip = True + + if idx == 0 and i == 4: # 7b, intern_overlap = False + skip = False + if idx == 0 and ckpt is True and i == 3: # 7b, ckpt = True + skip = False + if idx == 1: # 13b + skip = False + if idx == 2: # 30b + skip = False + + if skip: + import time; time.sleep(1) + print(f"skip {log_name}", flush=True) + continue + + command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=20 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" process = subprocess.Popen(command, shell=True, executable='/bin/bash') process.wait() \ No newline at end of file From 918dff72579baeb205ed8dc47bce9a2d7aba2c7d Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 25 Oct 2023 13:47:19 +0800 Subject: [PATCH 055/153] reset moe --- internlm/model/moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/internlm/model/moe.py b/internlm/model/moe.py index 0865097f..28e5ae6e 100644 --- a/internlm/model/moe.py +++ b/internlm/model/moe.py @@ -53,6 +53,7 @@ def __init__( device=None, dtype=None, ): + super().__init__() assert ( From 363275b500e907cd21e25db4e3bcc54d6acabaf0 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 25 Oct 2023 14:31:00 +0800 Subject: [PATCH 056/153] add memory print --- internlm/model/overlap_handler.py | 3 ++- train.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/internlm/model/overlap_handler.py 
b/internlm/model/overlap_handler.py index 35d8a594..d2fef8db 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -316,7 +316,8 @@ def before_forward(self, scheduler, inputs) -> None: self._overlap_handler.set_forward_mode(True) def after_forward(self, scheduler, outputs) -> None: - pass + print("after forward allocated memory: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True) + print("after forward max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) def before_criterion(self, scheduler, outputs, label) -> None: pass diff --git a/train.py b/train.py index ae867287..e1b8dffd 100644 --- a/train.py +++ b/train.py @@ -255,6 +255,8 @@ def main(args): # update parameters, and returns (success_update, grad_norm) trainer_result = trainer.step() assert trainer_result is not None + print("after step: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True) + print("after step: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) success_update, grad_norm_groups = trainer_result if success_update: # update parameters successfully From cc20fa271a74bd792476bfb96c3be18660580c1a Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 25 Oct 2023 16:48:02 +0800 Subject: [PATCH 057/153] reset print memory --- internlm/model/overlap_handler.py | 3 +-- train.py | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index d2fef8db..35d8a594 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -316,8 +316,7 @@ def before_forward(self, scheduler, inputs) -> None: self._overlap_handler.set_forward_mode(True) def after_forward(self, scheduler, outputs) -> None: - print("after forward allocated memory: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True) - print("after forward max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) + pass def before_criterion(self, scheduler, outputs, label) -> None: pass diff --git a/train.py b/train.py index e1b8dffd..ae867287 100644 --- a/train.py +++ b/train.py @@ -255,8 +255,6 @@ def main(args): # update parameters, and returns (success_update, grad_norm) trainer_result = trainer.step() assert trainer_result is not None - print("after step: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True) - print("after step: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) success_update, grad_norm_groups = trainer_result if success_update: # update parameters successfully From d831ddcc1d44a1ed4c8710a2f383529d31a6dc9d Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 26 Oct 2023 17:41:17 +0800 Subject: [PATCH 058/153] modify the config --- configs/13B_template.py | 4 ++-- configs/30B_template.py | 6 +++--- configs/7B_sft.py | 4 ++-- configs/generate.py | 19 ++----------------- train.py | 5 +++-- 5 files changed, 12 insertions(+), 26 deletions(-) diff --git a/configs/13B_template.py b/configs/13B_template.py index e0e016cc..849c5aa9 100644 --- a/configs/13B_template.py +++ b/configs/13B_template.py @@ -57,7 +57,7 @@ valid_micro_num=4, # defaults to 0, means disable evaluate valid_every=50, - pack_sample_into_one=False, + pack_sample_into_one=True, total_steps=20, skip_batches="", rampup_batch_size="", @@ -65,7 +65,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + 
empty_cache_and_diag_interval=100, diag_outlier_ratio=1.1, ) diff --git a/configs/30B_template.py b/configs/30B_template.py index 4ac99bf0..d19ece6e 100644 --- a/configs/30B_template.py +++ b/configs/30B_template.py @@ -2,7 +2,7 @@ DO_ALERT = False SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) +JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) HIDDEN_SIZE = 6144 NUM_ATTENTION_HEAD = 48 MLP_RATIO = 8 / 3 @@ -57,7 +57,7 @@ valid_micro_num=4, # defaults to 0, means disable evaluate valid_every=50, - pack_sample_into_one=False, + pack_sample_into_one=True, total_steps=20, skip_batches="", rampup_batch_size="", @@ -65,7 +65,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + empty_cache_and_diag_interval=100, diag_outlier_ratio=1.1, ) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 4f482656..2d6a3bee 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=True, - total_steps=20, + total_steps=50, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/configs/generate.py b/configs/generate.py index a8a5898a..038998c7 100644 --- a/configs/generate.py +++ b/configs/generate.py @@ -39,22 +39,7 @@ log_name = root_name + "_" + output_file_name[:-3] - skip = True - - if idx == 0 and i == 4: # 7b, intern_overlap = False - skip = False - if idx == 0 and ckpt is True and i == 3: # 7b, ckpt = True - skip = False - if idx == 1: # 13b - skip = False - if idx == 2: # 30b - skip = False - - if skip: - import time; time.sleep(1) - print(f"skip {log_name}", flush=True) - continue - - command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=20 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" + print(log_name) + command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" process = subprocess.Popen(command, shell=True, executable='/bin/bash') process.wait() \ No newline at end of file diff --git a/train.py b/train.py index ae867287..f4195964 100644 --- a/train.py +++ b/train.py @@ -309,8 +309,9 @@ def main(args): if memory_profiler is not None: memory_profiler.step() - - prof.step() + + if batch_count % 2 == 0: + prof.step() if gpc.fstp_handler is not None: gpc.fstp_handler.clear_memory_pool() From cbd4f042447ec71ee9523fd0ad646d3c074848cf Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 26 Oct 2023 20:04:01 +0800 Subject: [PATCH 059/153] add synchronize --- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 6f983f3e..19a79bfd 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -856,7 +856,7 @@ def broadcast_params(self): for handle in handles: handle.wait() - torch.cuda().synchronize() + 
torch.cuda.synchronize() ################## # FP16 Utilities # From 3253cbf48ef23c7e67e340533c16e1a372579f8e Mon Sep 17 00:00:00 2001 From: mwiacx <759046501@qq.com> Date: Thu, 26 Oct 2023 20:21:46 +0800 Subject: [PATCH 060/153] add a new get_tflops_func --- internlm/utils/common.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/internlm/utils/common.py b/internlm/utils/common.py index f3b58c0c..188a634d 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -220,6 +220,43 @@ def get_megatron_flops( return tflops +def get_megatron_flops_2( + elapsed_time_per_iter, + checkpoint=False, + seq_len=2048, + hidden_size=12, + num_layers=32, + vocab_size=12, + global_batch_size=4, + global_world_size=1, + mlp_ratio=4, + use_swiglu=True, +): + """ + Calc flops based on the paper of Megatron https://deepakn94.github.io/assets/papers/megatron-sc21.pdf + """ + + checkpoint_activations_factor = 4 if checkpoint else 3 + flashattn_activations_factor = 4.5 if checkpoint else 3.5 + + if use_swiglu: + mlp_ratio = mlp_ratio * 3 / 2 + + flops_per_iteration = ( + checkpoint_activations_factor + * (8 + mlp_ratio * 4) + * global_batch_size + * seq_len + * hidden_size**2 + * num_layers + + 4 * global_batch_size * seq_len**2 * hidden_size * num_layers * flashattn_activations_factor + + 6 * global_batch_size * seq_len * hidden_size * vocab_size + ) + + tflops = flops_per_iteration / (elapsed_time_per_iter * global_world_size * (10**12)) + return tflops + + class DummyProfile: """ Dummy Profile. From 8aefb74e02d6083d308a15b4d90309a24e1a093b Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 26 Oct 2023 20:33:12 +0800 Subject: [PATCH 061/153] add flash tflops --- internlm/train/training_internlm.py | 13 +++++++++++++ train.py | 14 ++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index df3fa88d..a4b2e598 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -406,11 +406,13 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): tgs_list = [] tflops_list = [] +tflops_list_2 = [] @llm_timeout(func_name="record_current_batch_training_metrics") def record_current_batch_training_metrics( get_tflops_func, + get_tflops_func_2, logger, writer, success_update, @@ -495,6 +497,7 @@ def record_current_batch_training_metrics( tgs_SMA = round(tgs_statistic["SMA_tg_50"] / tgs_statistic["SMA_time_50"], 2) tflops = get_tflops_func((time.time() - start_time)) + tflops_2 = get_tflops_func_2((time.time() - start_time)) tgs_origin = round( num_tokens_in_batch @@ -506,6 +509,7 @@ def record_current_batch_training_metrics( infos = { "tflops": tflops, + "tflops2": tflops_2, "step": batch_count, "loss": loss.item() - moe_loss.item() if moe_loss is not None else loss.item(), "tgs (tokens/gpu/second)": tgs_origin, @@ -599,6 +603,7 @@ def record_current_batch_training_metrics( if batch_count >= 5: tgs_list.append(tgs_origin) tflops_list.append(tflops) + tflops_list_2.append(tflops_2) if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) @@ -606,9 +611,17 @@ def record_current_batch_training_metrics( if abs(tgs - avg_tgs) > 400: tgs_list.remove(tgs) print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) + print(tflops_list, flush=True) avg_tflops = sum(tflops_list) / len(tflops_list) for tf in tflops_list.copy(): if abs(tf - avg_tflops) > 10: 
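                    # Steps whose TFLOPS estimate deviates from the running mean by more
                    # than 10 are treated as outliers and dropped before the average is
                    # reported, mirroring the tgs filtering (threshold 400) just above.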
                    tflops_list.remove(tf)
            print(f"avg_tflops: {sum(tflops_list)/len(tflops_list)}", flush=True)
+
+            print(tflops_list_2, flush=True)
+            avg_tflops_2 = sum(tflops_list_2) / len(tflops_list_2)
+            for tf in tflops_list_2.copy():
+                if abs(tf - avg_tflops_2) > 10:
+                    tflops_list_2.remove(tf)
+            print(f"avg_tflops: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True)

diff --git a/train.py b/train.py
index f4195964..45117623 100644
--- a/train.py
+++ b/train.py
@@ -33,6 +33,7 @@ from internlm.utils.common import (
     BatchSkipper,
     get_megatron_flops,
+    get_megatron_flops_2,
     launch_time,
     parse_args,
 )
@@ -111,6 +112,18 @@ def main(args):
         global_world_size=gpc.get_world_size(ParallelMode.GLOBAL),
         mlp_ratio=gpc.config.MLP_RATIO,
     )
+
+    get_tflops_func_2 = partial(
+        get_megatron_flops_2,
+        checkpoint=gpc.config.model.checkpoint,
+        seq_len=gpc.config.SEQ_LEN,
+        hidden_size=gpc.config.model.hidden_size,
+        num_layers=gpc.config.model.num_layers,
+        vocab_size=gpc.config.model.vocab_size,
+        global_batch_size=gpc.config.data.micro_bsz * gpc.config.data.micro_num * gpc.get_world_size(ParallelMode.DATA),
+        global_world_size=gpc.get_world_size(ParallelMode.GLOBAL),
+        mlp_ratio=gpc.config.MLP_RATIO,
+    )
 
     # get and broadcast current time
     current_time = launch_time()
@@ -271,6 +284,7 @@ def main(args):
         # calculate and record the training metrics, eg. loss, accuracy and so on.
         record_current_batch_training_metrics(
             get_tflops_func=get_tflops_func,
+            get_tflops_func_2=get_tflops_func_2,
             logger=logger,
             writer=writer,
             success_update=success_update,

From aa3840fc3853185b03213fe43459f19d5fb80d53 Mon Sep 17 00:00:00 2001
From: yingtongxiong <974106207@qq.com>
Date: Thu, 26 Oct 2023 20:42:24 +0800
Subject: [PATCH 062/153] fix some bugs

---
 internlm/train/training_internlm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py
index a4b2e598..2b806926 100644
--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@@ -624,4 +624,4 @@ def record_current_batch_training_metrics(
         for tf in tflops_list_2.copy():
             if abs(tf - avg_tflops_2) > 10:
                 tflops_list_2.remove(tf)
-        print(f"avg_tflops: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True)
+        print(f"avg_tflops_2: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True)

From d831ddcc1d44a1ed4c8710a2f383529d31a6dc9d Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Fri, 27 Oct 2023 20:04:23 +0800
Subject: [PATCH 063/153] feat(model/overlap_handler.py): fix overlap handler to support pp(non-interleaved)

---
 internlm/model/overlap_handler.py | 40 ++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py
index 35d8a594..8462def4 100644
--- a/internlm/model/overlap_handler.py
+++ b/internlm/model/overlap_handler.py
@@ -75,6 +75,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non
             if child.bias is not None:
                 setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias")
 
+        self.num_blocks = len(self.index_to_fstp_modules)
+
         self._initialize_memory_pool()
         self._register_sync_parameters_hook()
@@ -219,15 +221,25 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any):  # pylint: d
                     self._all_gather_block_weight_memory_pool(block_index - 1)
             else:
                 # start the all-gather for next block
-                if block_index + 1 < gpc.config.NUM_LAYER:
+                if block_index + 1 < self.num_blocks:
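+                    # `self.num_blocks` counts only the transformer blocks owned by this
+                    # rank (len(self.index_to_fstp_modules)), so this bound also holds
+                    # under non-interleaved pipeline parallelism, where each stage keeps
+                    # a subset of layers (gpc.config.NUM_LAYER counted every layer).
+                    # Launching the all-gather for the next block's sharded weights here
+                    # lets the communication overlap with the current block's compute.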
self._all_gather_block_weight_memory_pool(block_index + 1) def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 - handle = self.fstp_global_handle[module] - handle.wait() - if module.bias is not None: - bias_handle = self.bias_global_handle[module] - bias_handle.wait() + if module in self.fstp_global_handle: + handle = self.fstp_global_handle[module] + handle.wait() + if module.bias is not None: + bias_handle = self.bias_global_handle[module] + bias_handle.wait() + else: + weight_handle = all_gather_raw_memory_pool( + module.weight, + self.process_group, + async_op=True, + module=module, + ) + self.fstp_global_handle[module] = weight_handle + weight_handle.wait() def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 if module in self.fstp_global_handle: @@ -245,12 +257,22 @@ def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # def _pre_backward_hook_for_head(module: nn.Module, grad_output): if self.is_forward is False: - self._all_gather_block_weight_memory_pool(gpc.config.NUM_LAYER - 1) + self._all_gather_block_weight_memory_pool(self.num_blocks - 1) def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module - weight_handle = self.fstp_global_handle[module] - weight_handle.wait() + if module in self.fstp_global_handle: + weight_handle = self.fstp_global_handle[module] + weight_handle.wait() + else: + weight_handle = all_gather_raw_memory_pool( + module.weight, + self.process_group, + async_op=True, + module=module, + ) + self.fstp_global_handle[module] = weight_handle + weight_handle.wait() # start the all-gather for next module module_index = self.fstp_modules.index(module) From 4c1cd5d49ba65fa903183bb1c6759a5e3f5f8b4b Mon Sep 17 00:00:00 2001 From: mwiacx <759046501@qq.com> Date: Tue, 31 Oct 2023 19:39:24 +0800 Subject: [PATCH 064/153] fix async reduce scatter --- internlm/model/overlap_handler.py | 11 +- .../solver/optimizer/hybrid_zero_optim.py | 136 ++++++++++-------- train.py | 17 +-- 3 files changed, 88 insertions(+), 76 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 8462def4..7805e111 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -328,13 +328,12 @@ class FSTPOverlapSchedulerHook(SchedulerHook): SchedulerHook for fstp overlap handler """ - def __init__(self, overlap_handler: FSTPOverlapHandler) -> None: - super().__init__() - + def __init__(self, overlap_handler: FSTPOverlapHandler, zero_optim) -> None: self._overlap_handler = overlap_handler + self._zero_optim = zero_optim def before_forward(self, scheduler, inputs) -> None: - if self._overlap_handler is not None: + if self._overlap_handler.model_checkpoint: self._overlap_handler.set_forward_mode(True) def after_forward(self, scheduler, outputs) -> None: @@ -347,11 +346,11 @@ def after_criterion(self, scheduler, loss) -> None: pass def before_backward(self, scheduler, outputs, outputs_grad) -> None: - if self._overlap_handler is not None: + if self._overlap_handler.model_checkpoint: self._overlap_handler.set_forward_mode(False) def after_backward(self, scheduler, inputs_grad) -> None: - pass + self._zero_optim.accumulate_left_grads_after_backward() def post_helper_func(self, scheduler, outputs, label) -> None: pass diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 19a79bfd..2d04bc64 100644 --- 
a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -66,10 +66,6 @@ def __init__( hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - self._fstp_handler = None - if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - self._fstp_handler = gpc.fstp_handler - # Zero related args reduce_bucket_size = zero_cfg.reduce_bucket_size clip_grad_norm = zero_cfg.clip_grad_norm @@ -133,6 +129,12 @@ def __init__( if self._overlap_sync_param: assert self._param_bcast_sync_handler is not None + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: + self._fstp_handler = gpc.fstp_handler + else: + self._fstp_handler = None + self._accum_grad_buckets: List[BucketStore] = [] + # iterate over the param group in the optimizer # partition these param groups for data parallel training # and add buffers to parameter store for future access @@ -221,8 +223,7 @@ def __init__( # reduction hook is only used if overlapping communication # if it is stage 1 without overlapping, no hook will be attached - if self._overlap_sync_grad: - self._attach_reduction_hook() + self._attach_reduction_hook() @property def zero_local_rank(self): @@ -289,60 +290,79 @@ def _attach_reduction_hook(self): param_group = self._fp16_param_groups[group_id] for param in param_group: # we should not reduce the param in moe - if param.requires_grad: - reduce_rank = None - - def _define_and_attach(param, reduce_rank=None): - # get the AccumulateGrad object of the param itself - # If these objects are not kept, reduction hooks may not be attached successfully. - accum_grad_obj = get_grad_accumulate_object(param) - self._grad_store.add_accumulate_grad_object(accum_grad_obj) - - reduction_func = partial( - self._store_and_try_reduce_grads_by_bucket, - param=param, - reduce_rank=reduce_rank, - ) + if not param.requires_grad: + continue - reduce_scatter_checker = partial( - self._wait_reduce_scatter_and_accumulate_grads, - param=param, - reduce_rank=reduce_rank, - ) - def reduction_sp_func(): - handle = reduce_tensor( - param.grad, - dtype=None, - dst_rank=reduce_rank, - parallel_mode=ParallelMode.TENSOR, - ) - handle.wait() - - # define hook - # NOT IMPORTANT BUT GOOD TO KNOW: - # args here is not grad, but allow_unreacable and accumulate_grad - def reduce_grad_hook(*args): # pylint: disable=W0613 - if self._fstp_handler is not None: - reduce_scatter_checker() - - if self.skip_grad_reduce is False: - reduction_func() - - # define hook for sequence_parallel - def reduce_grad_hook_sp(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_sp_func() - - # if sequence_parallel is True, - # the grad of norm should be all-reduce across the tp process group - if gpc.config.parallel.sequence_parallel is True: - if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: - accum_grad_obj_sp = get_grad_accumulate_object(param) - accum_grad_obj_sp.register_hook(reduce_grad_hook_sp) + reduce_rank = None + def _define_and_attach(param, reduce_rank=None): + reduction_func = partial( + self._store_and_try_reduce_grads_by_bucket, + param=param, + reduce_rank=reduce_rank, + ) + + reduce_scatter_checker = partial( + self._wait_reduce_scatter_and_accumulate_grads, + param=param, + reduce_rank=reduce_rank, + ) + + def reduction_sp_func(): + handle = reduce_tensor( + param.grad, + dtype=None, + dst_rank=reduce_rank, + 
parallel_mode=ParallelMode.TENSOR, + ) + handle.wait() + + # define hook + # NOT IMPORTANT BUT GOOD TO KNOW: + # args here is not grad, but allow_unreacable and accumulate_grad + def reduce_grad_hook(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_func() + + # define hook for real gradient accumulation. + def accum_grad_hook(*args): # pylint: disable=W0613 + reduce_scatter_checker() + + # define hook for sequence_parallel + def reduce_grad_hook_sp(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_sp_func() + + # get the AccumulateGrad object of the param itself + # If these objects are not kept, reduction hooks may not be attached successfully. + accum_grad_obj = get_grad_accumulate_object(param) + self._grad_store.add_accumulate_grad_object(accum_grad_obj) + + # if sequence_parallel is True, + # the grad of norm should be all-reduce across the tp process group + if ( + gpc.config.parallel.sequence_parallel is True + and hasattr(param, IS_SEQUENCE_PARALLEL) + and getattr(param, IS_SEQUENCE_PARALLEL) is True + ): + accum_grad_obj.register_hook(reduce_grad_hook_sp) + + # we should not only register for parameters which have _fstp_reduce_scatter_str attr. + # we must keep up with reduce_grad_hook. + if self._fstp_handler is not None: + accum_grad_obj.register_hook(accum_grad_hook) + + if self._overlap_sync_grad: accum_grad_obj.register_hook(reduce_grad_hook) - _define_and_attach(param, reduce_rank) + _define_and_attach(param, reduce_rank) + + def accumulate_left_grads_after_backward(self): + if self._fstp_handler is None: + return + + for group_id in range(self.num_param_groups): + self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id]) def belongs_to_current_rank(self, param) -> bool: """ @@ -633,10 +653,6 @@ def step(self, closure=None): if param.grad is not None: self._store_and_try_reduce_grads_by_bucket(param) - # we need to accumulate gradients left in the accumulate gardient bucket - for group_id in range(self.num_param_groups): - self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id], reduce_rank=None) - # we need to reduce the gradients left in the communication bucket for group_id in range(self.num_param_groups): self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) diff --git a/train.py b/train.py index 45117623..644bbebc 100644 --- a/train.py +++ b/train.py @@ -5,7 +5,7 @@ import time import traceback from functools import partial -from typing import List, Optional +from typing import List import torch import torch.distributed as dist @@ -70,9 +70,7 @@ def initialize_llm_logger(start_time: str): return uniscale_logger -def get_scheduler_hooks( - metric: Optional[AccPerplex] = None, activation_checkpoint: bool = False -) -> List[SchedulerHook]: +def get_scheduler_hooks(metric, zero_optim) -> List[SchedulerHook]: scheduler_hooks: List[SchedulerHook] = [] if metric is not None: @@ -87,9 +85,8 @@ def get_scheduler_hooks( ), ), ) - - if activation_checkpoint: - scheduler_hooks.append(FSTPOverlapSchedulerHook(gpc.fstp_handler)) + if gpc.fstp_handler is not None: + scheduler_hooks.append(FSTPOverlapSchedulerHook(gpc.fstp_handler, zero_optim)) return scheduler_hooks @@ -112,7 +109,7 @@ def main(args): global_world_size=gpc.get_world_size(ParallelMode.GLOBAL), mlp_ratio=gpc.config.MLP_RATIO, ) - + get_tflops_func_2 = partial( get_megatron_flops_2, checkpoint=gpc.config.model.checkpoint, @@ -196,7 +193,7 @@ def main(args): train_dataloader=train_dl, 
lr_scheduler=lr_scheduler, beta2_scheduler=beta2_scheduler, - scheduler_hooks=get_scheduler_hooks(metric, gpc.config.model.checkpoint), + scheduler_hooks=get_scheduler_hooks(metric, optimizer), ) # initialize simple memory profiler @@ -323,7 +320,7 @@ def main(args): if memory_profiler is not None: memory_profiler.step() - + if batch_count % 2 == 0: prof.step() From 6b843253eb7ef6829daa966ff06de0889c664b1c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 31 Oct 2023 20:26:36 +0800 Subject: [PATCH 065/153] fix(optimizer/hybrid_zero_optim.py): remove redundant _accum_grad_buckets --- internlm/solver/optimizer/hybrid_zero_optim.py | 1 - 1 file changed, 1 deletion(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 2d04bc64..0ab63960 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -133,7 +133,6 @@ def __init__( self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None - self._accum_grad_buckets: List[BucketStore] = [] # iterate over the param group in the optimizer # partition these param groups for data parallel training From b3def4c1628dbba652ffb9b089eeb7be9de584af Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 31 Oct 2023 20:40:58 +0800 Subject: [PATCH 066/153] fix(optimizer/hybrid_zero_optim.py): add reduce_scatter_overlap switch --- configs/7B_sft.py | 4 ++-- internlm/model/overlap_handler.py | 9 +++++---- internlm/model/utils.py | 4 ++-- internlm/solver/optimizer/hybrid_zero_optim.py | 5 +++-- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 2d6a3bee..b34a838b 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=True, - total_steps=50, + total_steps=10, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), + tensor=dict(size=8, sp="intern", intern_overlap=True, reduce_scatter_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 7805e111..418c4aa7 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -70,10 +70,11 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non setattr(child, "_fstp_name", name) - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + if gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") self.num_blocks = len(self.index_to_fstp_modules) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 982c0e08..63dd09d7 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -568,7 +568,7 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - if 
overlap_handler is not None:
+            if overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False):
                 grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(
                     grad_weight, process_group, async_op=True
                 )
@@ -621,7 +621,7 @@ def backward(ctx, grad_output, *args):
         del total_weight
 
     if ctx.needs_input_grad[1]:
-        if world_size > 1 and overlap_handler is None:
+        if world_size > 1 and not (overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)):
             handle_grad_weight.wait()
             if grad_bias is not None:
                 handle_grad_bias.wait()
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index 0ab63960..a8b524ac 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -133,6 +133,7 @@ def __init__(
             self._fstp_handler = gpc.fstp_handler
         else:
             self._fstp_handler = None
+        self._reduce_scatter_overlap = gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)
 
         # iterate over the param group in the optimizer
         # partition these param groups for data parallel training
@@ -348,7 +349,7 @@ def reduce_grad_hook_sp(*args):  # pylint: disable=W0613
 
             # we should not only register for parameters which have _fstp_reduce_scatter_str attr.
             # we must keep up with reduce_grad_hook.
-            if self._fstp_handler is not None:
+            if self._fstp_handler is not None and self._reduce_scatter_overlap is True:
                 accum_grad_obj.register_hook(accum_grad_hook)
@@ -357,7 +358,7 @@ def reduce_grad_hook_sp(*args):  # pylint: disable=W0613
         _define_and_attach(param, reduce_rank)
 
     def accumulate_left_grads_after_backward(self):
-        if self._fstp_handler is None:
+        if self._fstp_handler is None or self._reduce_scatter_overlap is False:
            return
 
        for group_id in range(self.num_param_groups):

From 10b5056e1ebfe540f1008c97f4b3bcdafe8b22da Mon Sep 17 00:00:00 2001
From: yingtongxiong <974106207@qq.com>
Date: Wed, 1 Nov 2023 12:31:52 +0800
Subject: [PATCH 067/153] fix all-gather overlap when model_checkpoint is 0

---
 configs/7B_sft.py                   | 8 ++++----
 internlm/model/overlap_handler.py   | 2 +-
 internlm/train/training_internlm.py | 6 +++++-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index b34a838b..99285085 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -1,7 +1,7 @@
 JOB_NAME = "7b_train"
 DO_ALERT = False
 
-SEQ_LEN = 4096
+SEQ_LEN = 2048
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
 MLP_RATIO = 8 / 3
@@ -49,9 +49,9 @@
 data = dict(
     seq_len=SEQ_LEN,
     # micro_num means the number of micro_batch contained in one gradient update
-    micro_num=1,
+    micro_num=4,
     # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
+    micro_bsz=2,
     # defaults to the value of micro_num
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
@@ -163,7 +163,7 @@
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True, reduce_scatter_overlap=True),
+    tensor=dict(size=4, sp="intern", intern_overlap=True, reduce_scatter_overlap=True),
     pipeline=dict(size=1, interleaved_overlap=True),
 )
diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py
index 418c4aa7..db811504 100644
--- a/internlm/model/overlap_handler.py
+++ b/internlm/model/overlap_handler.py
@@ -315,7 +315,7 @@ def _post_backward_hook_for_module(module, grad_input, grad_output):  # pylint:
         # 1.
register post_backward_hook @head module to prefetch for the last block's last module # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module # 3. register post_backward_hook @fstp_module to release resource - if self.model_checkpoint is False: + if not self.model_checkpoint: for head in self.head: head.register_full_backward_hook(_post_backward_hook_for_head) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2b806926..2b5a1bb4 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -407,6 +407,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): tgs_list = [] tflops_list = [] tflops_list_2 = [] +loss_list = [] @llm_timeout(func_name="record_current_batch_training_metrics") @@ -599,11 +600,12 @@ def record_current_batch_training_metrics( step_count=batch_count, cur_step_loss=loss.item(), ) - + loss_list.append(loss.item()) if batch_count >= 5: tgs_list.append(tgs_origin) tflops_list.append(tflops) tflops_list_2.append(tflops_2) + if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) @@ -625,3 +627,5 @@ def record_current_batch_training_metrics( if abs(tf - avg_tflops_2) > 10: tflops_list_2.remove(tf) print(f"avg_tflops_2: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True) + + print("loss: ", loss_list, flush=True) From 48512913567ba88b3280ba660e0c3b5ac60cef55 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 2 Nov 2023 10:30:16 +0800 Subject: [PATCH 068/153] fix(optimizer/hybrid_zero_optim.py): fix bucket size full judge condition when reduce scatter overlap --- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- internlm/train/training_internlm.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index a8b524ac..1472aa85 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -404,7 +404,7 @@ def _wait_reduce_scatter_and_accumulate_grads(self, param, reduce_rank: Optional # check if the bucket is full # if full, will reduce the grads already in the bucket # after reduction, the bucket will be empty - if current_bucket.num_elements_in_bucket(reduce_rank) >= self._reduce_bucket_size: + if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: self._accum_grads_store_in_bucket(current_bucket, reduce_rank) # otherwise, add the parameter into bucket. 
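The key change above is that the bucket is now flushed *before* an incoming gradient would overflow it (`num_elements_in_bucket + param_size > bucket_size`), instead of only after the bucket had already reached capacity. A minimal standalone sketch of this pre-check pattern (hypothetical Bucket class, not the real BucketStore API):

    class Bucket:
        """Hypothetical stand-in for BucketStore, for illustration only."""

        def __init__(self, capacity: int):
            self.capacity = capacity
            self.num_elements = 0
            self.grads = []

        def add(self, grad_numel: int, flush):
            # Flush first if this gradient would not fit, mirroring the fixed
            # condition `num_elements_in_bucket(rank) + param_size > bucket_size`.
            if self.num_elements + grad_numel > self.capacity:
                flush(self.grads)
                self.grads.clear()
                self.num_elements = 0
            self.grads.append(grad_numel)
            self.num_elements += grad_numel

With the old `>=` post-check, one oversized gradient could push the reduce-scatter buffer past `reduce_bucket_size` before the next call flushed it; the pre-check keeps the buffer bounded.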
diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2b5a1bb4..a05f62df 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -600,12 +600,12 @@ def record_current_batch_training_metrics( step_count=batch_count, cur_step_loss=loss.item(), ) + loss_list.append(loss.item()) if batch_count >= 5: tgs_list.append(tgs_origin) tflops_list.append(tflops) tflops_list_2.append(tflops_2) - if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) @@ -627,5 +627,5 @@ def record_current_batch_training_metrics( if abs(tf - avg_tflops_2) > 10: tflops_list_2.remove(tf) print(f"avg_tflops_2: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True) - + print("loss: ", loss_list, flush=True) From 5a18b3b6510ab1f79065c6c0e67ecaa0e581a1af Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 2 Nov 2023 16:05:07 +0800 Subject: [PATCH 069/153] fix(model/overlap_handler.py): fix last block hook when pp with activation --- internlm/model/overlap_handler.py | 64 ++++++++++++++++--------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index db811504..ed0a8d22 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -6,6 +6,7 @@ import torch from torch import nn +from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel from internlm.core.scheduler import SchedulerHook @@ -32,6 +33,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle self.module_to_index = dict() # key: fstp module; value: transformer block index self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules + self.last_block = None self.head = [] self.embedding = [] self.model_checkpoint = gpc.config.model.checkpoint @@ -54,6 +56,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non elif isinstance(children, Embedding1D): self.embedding.append(children) elif isinstance(children, nn.ModuleList): + self.last_block = children[len(children) - 1] for idx, block in enumerate(children): self.index_to_fstp_modules[idx] = [] for _sub_name, sub in block.named_children(): @@ -150,39 +153,23 @@ def get_bias_memory(self, module: nn.Module): return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] def get_reduce_scatter_memory(self, key): - return_idx = 0 - # if key not in dict if key not in self.reduce_scatter_memory_pool: self.reduce_scatter_memory_pool[key] = [] - # if the data is empty - if len(self.reduce_scatter_memory_pool[key]) == 0: - self.reduce_scatter_memory_pool[key].append( - torch.zeros( - key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() - ).contiguous() - ) - setattr(self.reduce_scatter_memory_pool[key][return_idx], "idle", False) - setattr(self.reduce_scatter_memory_pool[key][return_idx], "index", return_idx) - return self.reduce_scatter_memory_pool[key][return_idx] - else: # if not empty - for index, mem_item in enumerate(self.reduce_scatter_memory_pool[key]): - if mem_item.idle is True: - self.reduce_scatter_memory_pool[key][index].idle = False - return_idx = index - return self.reduce_scatter_memory_pool[key][return_idx] - # if the 
memory pool is all used - cur_len = len(self.reduce_scatter_memory_pool[key]) - self.reduce_scatter_memory_pool[key].append( - torch.zeros( - key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() - ).contiguous() - ) - setattr(self.reduce_scatter_memory_pool[key][cur_len], "idle", False) - return_idx = cur_len - setattr(self.reduce_scatter_memory_pool[key][return_idx], "index", return_idx) - return self.reduce_scatter_memory_pool[key][return_idx] + for index, mem_item in enumerate(self.reduce_scatter_memory_pool[key]): + if mem_item.idle is True: + self.reduce_scatter_memory_pool[key][index].idle = False + return self.reduce_scatter_memory_pool[key][index] + + # if the memory pool is all used + cur_len = len(self.reduce_scatter_memory_pool[key]) + self.reduce_scatter_memory_pool[key].append( + torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() + ) + setattr(self.reduce_scatter_memory_pool[key][cur_len], "idle", False) + setattr(self.reduce_scatter_memory_pool[key][cur_len], "index", cur_len) + return self.reduce_scatter_memory_pool[key][cur_len] def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True @@ -242,6 +229,18 @@ def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: dis self.fstp_global_handle[module] = weight_handle weight_handle.wait() + def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disable=W0613 + fstp_modules = self.index_to_fstp_modules[self.num_blocks - 1] + if module in fstp_modules: + weight_handle = all_gather_raw_memory_pool( + module.weight, + self.process_group, + async_op=True, + module=module, + ) + self.fstp_global_handle[module] = weight_handle + weight_handle.wait() + def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 if module in self.fstp_global_handle: del self.fstp_global_handle[module] @@ -301,8 +300,11 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: embedding.register_forward_hook(_post_forward_hook_for_embedding) if self.model_checkpoint: - for head in self.head: - head.register_full_backward_pre_hook(_pre_backward_hook_for_head) + if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): + for head in self.head: + head.register_full_backward_pre_hook(_pre_backward_hook_for_head) + else: + self.last_block.register_forward_pre_hook(_pre_forward_hook_for_block) for out_proj in self.fstp_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) From 9b1265c59107edd44063684c96446af20892fd25 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 6 Nov 2023 10:45:08 +0800 Subject: [PATCH 070/153] modify the sp allreduce and support tf32 for fstp linear --- .gitignore | 2 + configs/generate.py | 8 ++ internlm/model/utils.py | 102 +++++++++++++++++- .../solver/optimizer/hybrid_zero_optim.py | 43 ++++---- 4 files changed, 129 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index 9bdc7ec7..ef18a4a4 100644 --- a/.gitignore +++ b/.gitignore @@ -152,6 +152,8 @@ fstp_logs/ configs/7B_train/* configs/13B_train/* configs/30B_train/* +configs/test_loss/* +loss_tensorboard/* atb pip diff --git a/configs/generate.py b/configs/generate.py index 038998c7..5f044e72 100644 --- a/configs/generate.py +++ b/configs/generate.py @@ -39,6 +39,14 @@ log_name = root_name + "_" + output_file_name[:-3] + skip = True + + if sp_mode == "intern" and intern_overlap[i] is 
True: + skip = False + + if skip: + continue + print(log_name) command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" process = subprocess.Popen(command, shell=True, executable='/bin/bash') diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 63dd09d7..4f197b1f 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -627,6 +627,104 @@ def backward(ctx, grad_output, *args): handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None, None, None, None +class FSTPFusedDenseFuncTorch(FSTPFusedDenseFunc): + "FusedDenseFunc for FSTP, which is optimized based on flash implementation." + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + (grad_input,) = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + overlap_handler = ctx.overlap_handler + module = ctx.module + + if ctx.compute_weight_gradient: + x, weight, bias = ctx.saved_tensors + total_x = x + else: + weight, bias = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + if overlap_handler is not None: + total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) + else: + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + else: + total_weight = weight + + # compute weight grad + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + grad_weight, grad_bias = linear_bias_wgrad_torch( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + if world_size > 1: + if overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): + grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( + grad_weight, process_group, async_op=True + ) + assert hasattr(weight, "_fstp_reduce_scatter_str") + overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( + handle_grad_weight, + grad_weight_async, + ) + grad_weight = overlap_handler.get_zero_by_shape( + ( + grad_weight.shape[0] // torch.distributed.get_world_size(process_group), + *grad_weight.shape[1:], + ), + dtype=grad_weight.dtype, + device=grad_weight.device, + ) + if grad_bias is not None: + grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( + grad_bias, process_group, async_op=True + ) + assert hasattr(bias, "_fstp_reduce_scatter_str") + overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( + handle_grad_bias, + grad_bias_async, + ) + grad_bias = overlap_handler.get_zero_by_shape( + ( + grad_bias.shape[0] // torch.distributed.get_world_size(process_group), + *grad_bias.shape[1:], + ), + dtype=grad_bias.dtype, + device=grad_bias.device, + ) + else: + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, total_weight.t()) + else: + grad_input = 
torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + else: + grad_input = None + del total_weight + + if ctx.needs_input_grad[1]: + if world_size > 1 and not (overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)): + handle_grad_weight.wait() + if grad_bias is not None: + handle_grad_bias.wait() + return grad_input, grad_weight, grad_bias, None, None, None, None, None, None def fused_dense_func_torch( x: Tensor, @@ -683,9 +781,7 @@ def fstp_fused_dense_func( if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler) else: - assert process_group is None - out = F.linear(x, weight, bias) - return out if not return_residual else (out, x) + return FSTPFusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, module, handler) def try_import_RMSNorm(): diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 1472aa85..b2b16dcc 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -308,15 +308,6 @@ def _define_and_attach(param, reduce_rank=None): reduce_rank=reduce_rank, ) - def reduction_sp_func(): - handle = reduce_tensor( - param.grad, - dtype=None, - dst_rank=reduce_rank, - parallel_mode=ParallelMode.TENSOR, - ) - handle.wait() - # define hook # NOT IMPORTANT BUT GOOD TO KNOW: # args here is not grad, but allow_unreacable and accumulate_grad @@ -328,25 +319,11 @@ def reduce_grad_hook(*args): # pylint: disable=W0613 def accum_grad_hook(*args): # pylint: disable=W0613 reduce_scatter_checker() - # define hook for sequence_parallel - def reduce_grad_hook_sp(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_sp_func() - # get the AccumulateGrad object of the param itself # If these objects are not kept, reduction hooks may not be attached successfully. accum_grad_obj = get_grad_accumulate_object(param) self._grad_store.add_accumulate_grad_object(accum_grad_obj) - # if sequence_parallel is True, - # the grad of norm should be all-reduce across the tp process group - if ( - gpc.config.parallel.sequence_parallel is True - and hasattr(param, IS_SEQUENCE_PARALLEL) - and getattr(param, IS_SEQUENCE_PARALLEL) is True - ): - accum_grad_obj.register_hook(reduce_grad_hook_sp) - # we should not only register for parameters which have _fstp_reduce_scatter_str attr. # we must keep up with reduce_grad_hook. 
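# For readers unfamiliar with the AccumulateGrad objects referenced above, a
# minimal standalone illustration of the mechanism behind
# get_grad_accumulate_object (the parameter `p` and the print hook are
# examples only, not part of the patch):
import torch

p = torch.nn.Parameter(torch.randn(4))
# expand_as records a graph node whose next_functions[0][0] is the
# AccumulateGrad object of `p`; a reference to it must be kept alive or the
# hooks registered on it can be garbage-collected, which is why the optimizer
# stores it via add_accumulate_grad_object.
accum_grad_obj = p.expand_as(p).grad_fn.next_functions[0][0]
accum_grad_obj.register_hook(lambda *args: print("p.grad accumulated"))
(p * 2.0).sum().backward()  # the hook fires once p.grad has been written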
if self._fstp_handler is not None and self._reduce_scatter_overlap is True: @@ -644,6 +621,26 @@ def step(self, closure=None): """ assert closure is None, "closure is not supported by step()" + # do all-reduce for layernorm when sequence_parallel is True + if gpc.config.parallel.sequence_parallel is True: + for group_id in range(len(self._fp16_param_groups)): + norm_bucket = TensorBucket(size=0) + for param in self._fp16_param_groups[group_id]: + if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: + norm_bucket.add_to_bucket(param.grad, allow_oversize=True) + # import pdb; pdb.set_trace() + if not norm_bucket.is_empty(): + norm_bucket.flatten() + norm_bucket.commu_handle = reduce_tensor( + tensor=norm_bucket.get_flat_tensor(), + dtype=None, + dst_rank=None, + parallel_mode=ParallelMode.TENSOR, + ) + norm_bucket.commu_handle.wait() + norm_bucket.unflatten_and_copy() + # norm_bucket.empty() + # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_sync_grad: From c517ec5b8cdf9c675f97dcc615bfd39c2ffda010 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 6 Nov 2023 11:57:14 +0800 Subject: [PATCH 071/153] feat(model/overlap_handler.py): delete reduce_scatter_overlap switch --- configs/7B_sft.py | 2 +- internlm/model/overlap_handler.py | 9 ++++----- internlm/model/utils.py | 11 +++++++---- internlm/solver/optimizer/hybrid_zero_optim.py | 17 +++++++---------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 99285085..e85d2df8 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=4, sp="intern", intern_overlap=True, reduce_scatter_overlap=True), + tensor=dict(size=4, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index ed0a8d22..e3198bb7 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -73,11 +73,10 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non setattr(child, "_fstp_name", name) - if gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") self.num_blocks = len(self.index_to_fstp_modules) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 4f197b1f..556752aa 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -568,7 +568,7 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - if overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): + if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( grad_weight, process_group, async_op=True ) @@ -621,14 +621,16 @@ def backward(ctx, grad_output, *args): del total_weight if 
ctx.needs_input_grad[1]: - if world_size > 1 and not (overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)): + if world_size > 1 and overlap_handler is None: handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None, None, None, None + class FSTPFusedDenseFuncTorch(FSTPFusedDenseFunc): "FusedDenseFunc for FSTP, which is optimized based on flash implementation." + @staticmethod @custom_bwd def backward(ctx, grad_output, *args): @@ -667,7 +669,7 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - if overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): + if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( grad_weight, process_group, async_op=True ) @@ -720,12 +722,13 @@ def backward(ctx, grad_output, *args): del total_weight if ctx.needs_input_grad[1]: - if world_size > 1 and not (overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)): + if world_size > 1 and overlap_handler is None: handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None, None, None, None + def fused_dense_func_torch( x: Tensor, weight: Tensor, diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index b2b16dcc..9a277ae4 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -133,7 +133,6 @@ def __init__( self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None - self._reduce_scatter_overlap = gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False) # iterate over the param group in the optimizer # partition these param groups for data parallel training @@ -326,7 +325,7 @@ def accum_grad_hook(*args): # pylint: disable=W0613 # we should not only register for parameters which have _fstp_reduce_scatter_str attr. # we must keep up with reduce_grad_hook. 
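# PATCH 070/071 route fp32 inputs (e.g. running under tf32) through
# FSTPFusedDenseFuncTorch, whose backward computes the weight gradient with a
# plain-PyTorch helper instead of fused_dense_cuda. A sketch of that helper's
# math, with an assumed signature consistent with how linear_bias_wgrad_torch
# is called above:
import torch

def linear_bias_wgrad_torch(my_input: torch.Tensor, grad_output: torch.Tensor, has_d_bias: bool):
    # my_input: (tokens, in_features); grad_output: (tokens, out_features).
    # For y = x @ W.t() + b, dW = dy.t() @ x and db = dy.sum(dim=0).
    grad_weight = torch.matmul(grad_output.t(), my_input)
    grad_bias = grad_output.sum(dim=0) if has_d_bias else None
    return grad_weight, grad_bias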
- if self._fstp_handler is not None and self._reduce_scatter_overlap is True: + if self._fstp_handler is not None: accum_grad_obj.register_hook(accum_grad_hook) if self._overlap_sync_grad: @@ -335,7 +334,7 @@ def accum_grad_hook(*args): # pylint: disable=W0613 _define_and_attach(param, reduce_rank) def accumulate_left_grads_after_backward(self): - if self._fstp_handler is None or self._reduce_scatter_overlap is False: + if self._fstp_handler is None: return for group_id in range(self.num_param_groups): @@ -628,18 +627,16 @@ def step(self, closure=None): for param in self._fp16_param_groups[group_id]: if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: norm_bucket.add_to_bucket(param.grad, allow_oversize=True) - # import pdb; pdb.set_trace() if not norm_bucket.is_empty(): norm_bucket.flatten() norm_bucket.commu_handle = reduce_tensor( - tensor=norm_bucket.get_flat_tensor(), - dtype=None, - dst_rank=None, - parallel_mode=ParallelMode.TENSOR, - ) + tensor=norm_bucket.get_flat_tensor(), + dtype=None, + dst_rank=None, + parallel_mode=ParallelMode.TENSOR, + ) norm_bucket.commu_handle.wait() norm_bucket.unflatten_and_copy() - # norm_bucket.empty() # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients From 7c6d2936b352775443948010a9cfb9ba06080e85 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 6 Nov 2023 12:04:01 +0800 Subject: [PATCH 072/153] reset the sp allreduce in optimizer --- .../solver/optimizer/hybrid_zero_optim.py | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index b2b16dcc..1472aa85 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -308,6 +308,15 @@ def _define_and_attach(param, reduce_rank=None): reduce_rank=reduce_rank, ) + def reduction_sp_func(): + handle = reduce_tensor( + param.grad, + dtype=None, + dst_rank=reduce_rank, + parallel_mode=ParallelMode.TENSOR, + ) + handle.wait() + # define hook # NOT IMPORTANT BUT GOOD TO KNOW: # args here is not grad, but allow_unreacable and accumulate_grad @@ -319,11 +328,25 @@ def reduce_grad_hook(*args): # pylint: disable=W0613 def accum_grad_hook(*args): # pylint: disable=W0613 reduce_scatter_checker() + # define hook for sequence_parallel + def reduce_grad_hook_sp(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_sp_func() + # get the AccumulateGrad object of the param itself # If these objects are not kept, reduction hooks may not be attached successfully. accum_grad_obj = get_grad_accumulate_object(param) self._grad_store.add_accumulate_grad_object(accum_grad_obj) + # if sequence_parallel is True, + # the grad of norm should be all-reduce across the tp process group + if ( + gpc.config.parallel.sequence_parallel is True + and hasattr(param, IS_SEQUENCE_PARALLEL) + and getattr(param, IS_SEQUENCE_PARALLEL) is True + ): + accum_grad_obj.register_hook(reduce_grad_hook_sp) + # we should not only register for parameters which have _fstp_reduce_scatter_str attr. # we must keep up with reduce_grad_hook. 
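# The restored reduce_grad_hook_sp boils down to a sum all-reduce of each
# IS_SEQUENCE_PARALLEL parameter's gradient over the tensor-parallel group,
# since norm weights are replicated across those ranks under sequence
# parallelism. A minimal sketch with plain torch.distributed (`tp_group` is
# assumed to be an already-initialized ProcessGroup):
import torch.distributed as dist

def allreduce_norm_grad(param, tp_group):
    # Each rank holds a partial gradient from its own sequence shard;
    # summing the partials reproduces the full-sequence gradient everywhere.
    dist.all_reduce(param.grad, op=dist.ReduceOp.SUM, group=tp_group)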
if self._fstp_handler is not None and self._reduce_scatter_overlap is True: @@ -621,26 +644,6 @@ def step(self, closure=None): """ assert closure is None, "closure is not supported by step()" - # do all-reduce for layernorm when sequence_parallel is True - if gpc.config.parallel.sequence_parallel is True: - for group_id in range(len(self._fp16_param_groups)): - norm_bucket = TensorBucket(size=0) - for param in self._fp16_param_groups[group_id]: - if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: - norm_bucket.add_to_bucket(param.grad, allow_oversize=True) - # import pdb; pdb.set_trace() - if not norm_bucket.is_empty(): - norm_bucket.flatten() - norm_bucket.commu_handle = reduce_tensor( - tensor=norm_bucket.get_flat_tensor(), - dtype=None, - dst_rank=None, - parallel_mode=ParallelMode.TENSOR, - ) - norm_bucket.commu_handle.wait() - norm_bucket.unflatten_and_copy() - # norm_bucket.empty() - # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_sync_grad: From b5e4d04a9a410aec027a1273eae2d3687ae27834 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 6 Nov 2023 12:08:31 +0800 Subject: [PATCH 073/153] fix conflicts --- .../solver/optimizer/hybrid_zero_optim.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index e5927e65..b033539d 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -643,27 +643,6 @@ def step(self, closure=None): """ assert closure is None, "closure is not supported by step()" -<<<<<<< HEAD -======= - # do all-reduce for layernorm when sequence_parallel is True - if gpc.config.parallel.sequence_parallel is True: - for group_id in range(len(self._fp16_param_groups)): - norm_bucket = TensorBucket(size=0) - for param in self._fp16_param_groups[group_id]: - if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: - norm_bucket.add_to_bucket(param.grad, allow_oversize=True) - if not norm_bucket.is_empty(): - norm_bucket.flatten() - norm_bucket.commu_handle = reduce_tensor( - tensor=norm_bucket.get_flat_tensor(), - dtype=None, - dst_rank=None, - parallel_mode=ParallelMode.TENSOR, - ) - norm_bucket.commu_handle.wait() - norm_bucket.unflatten_and_copy() - ->>>>>>> c517ec5b8cdf9c675f97dcc615bfd39c2ffda010 # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_sync_grad: From 74754397df336db3c9fd03fb297792f8c4b546d8 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 13 Nov 2023 21:09:59 +0800 Subject: [PATCH 074/153] feat(model/overlap_handler.py): add memory_pool switch and refactor overlap handler --- configs/7B_sft.py | 2 +- internlm/model/overlap_handler.py | 197 ++++++++++-------- internlm/model/utils.py | 23 +- .../solver/optimizer/hybrid_zero_optim.py | 4 +- train.py | 2 +- 5 files changed, 130 insertions(+), 98 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index e85d2df8..63fa67e4 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=4, sp="intern", intern_overlap=True), + tensor=dict(size=4, sp="intern", intern_overlap=True, memory_pool=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git 
a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index e3198bb7..cb00d229 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -13,6 +13,7 @@ from internlm.model.embedding import Embedding1D from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear from internlm.model.utils import ( + all_gather_raw, all_gather_raw_bias_memory_pool, all_gather_raw_memory_pool, ) @@ -29,14 +30,17 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.fstp_outs = [] self.fstp_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.fstp_global_handle = dict() # key: fstp module; value: module global all-gather op handle + self.weight_global_handle = dict() # key: fstp module; value: module global all-gather op handle self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle + self.weight_global_output = dict() # key: fstp module; value: module global weight after all-gather op + self.bias_global_output = dict() # key: fstp module; value: module bias global weight after all-gather op self.module_to_index = dict() # key: fstp module; value: transformer block index self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules self.last_block = None self.head = [] self.embedding = [] self.model_checkpoint = gpc.config.model.checkpoint + self.enable_memory_pool = gpc.config.parallel["tensor"].get("memory_pool", False) self.is_forward = True self.reduce_scatter_handlers = {} @@ -60,34 +64,36 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non for idx, block in enumerate(children): self.index_to_fstp_modules[idx] = [] for _sub_name, sub in block.named_children(): - sub_modules = list(sub.children()) - if len(sub_modules) > 0: - for name, child in sub.named_children(): - if name == "out_proj": - self.fstp_outs.append(child) - self.module_to_index[child] = idx - if isinstance(child, FSTPLinear): - self.module_to_index[child] = idx - self.fstp_modules.append(child) - self.index_to_fstp_modules[idx].append(child) - - setattr(child, "_fstp_name", name) - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + for name, child in sub.named_children(): + if name == "out_proj": + self.fstp_outs.append(child) + self.module_to_index[child] = idx + if isinstance(child, FSTPLinear): + self.module_to_index[child] = idx + self.fstp_modules.append(child) + self.index_to_fstp_modules[idx].append(child) + + setattr(child, "_fstp_name", name) + + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") self.num_blocks = len(self.index_to_fstp_modules) - self._initialize_memory_pool() + if self.enable_memory_pool: + self._initialize_memory_pool() self._register_sync_parameters_hook() def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: - if size not in self.zero_const_pool: - self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() + if self.enable_memory_pool: + if size not in self.zero_const_pool: + self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() - return 
self.zero_const_pool[size] + return self.zero_const_pool[size] + else: + return torch.zeros(*size, dtype=dtype, device=device).contiguous() def set_forward_mode(self, flag): self.is_forward = flag @@ -122,14 +128,20 @@ def _initialize_memory_pool(self) -> None: self.all_gather_memory_pool.append(weight) # containing two groups of block weight def clear_memory_pool(self) -> None: + assert self.enable_memory_pool + self.zero_const_pool = {} self.reduce_scatter_memory_pool = {} - def get_all_gather_memory(self, module): + def _get_weight_from_memory_pool(self, module): + assert self.enable_memory_pool + block_index = self.module_to_index[module] return self.all_gather_memory_pool[block_index % 2][module._fstp_name] - def get_bias_memory(self, module: nn.Module): + def _get_bias_from_memory_pool(self, module: nn.Module): + assert self.enable_memory_pool + block_index = self.module_to_index[module] # if the bias memory pool is empty or module has been not allocated memory if len(self.all_gather_bias_memory_pool) == 0: @@ -151,7 +163,21 @@ def get_bias_memory(self, module: nn.Module): return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] + def get_weight_all_gather(self, module): + if self.enable_memory_pool: + return self._get_weight_from_memory_pool(module) + else: + return self.weight_global_output[module] + + def get_bias_all_gather(self, module): + if self.enable_memory_pool: + return self._get_bias_from_memory_pool(module) + else: + return self.bias_global_output[module] + def get_reduce_scatter_memory(self, key): + assert self.enable_memory_pool + # if key not in dict if key not in self.reduce_scatter_memory_pool: self.reduce_scatter_memory_pool[key] = [] @@ -171,11 +197,11 @@ def get_reduce_scatter_memory(self, key): return self.reduce_scatter_memory_pool[key][cur_len] def release_reduce_scatter_memory(self, key, index): + assert self.enable_memory_pool self.reduce_scatter_memory_pool[key][index].idle = True - def _all_gather_block_weight_memory_pool(self, block_index: int): - fstp_modules = self.index_to_fstp_modules[block_index] - for module in fstp_modules: + def _all_gather_module_weight(self, module): + if self.enable_memory_pool: if module.bias is not None: bias_handle = all_gather_raw_bias_memory_pool( module.bias, @@ -191,103 +217,102 @@ def _all_gather_block_weight_memory_pool(self, block_index: int): async_op=True, module=module, ) - self.fstp_global_handle[module] = weight_handle + self.weight_global_handle[module] = weight_handle + else: + if module.bias is not None: + bias_output, bias_handle = all_gather_raw( + module.bias, + self.process_group, + async_op=True, + ) + self.bias_global_handle[module] = bias_handle + self.bias_global_output[module] = bias_output + + weight_output, weight_handle = all_gather_raw( + module.weight, + self.process_group, + async_op=True, + ) + self.weight_global_handle[module] = weight_handle + self.weight_global_output[module] = weight_output + + def _all_gather_block_weight(self, block_index: int): + fstp_modules = self.index_to_fstp_modules[block_index] + for module in fstp_modules: + self._all_gather_module_weight(module) def _register_sync_parameters_hook(self) -> None: """ register forward hooks and backward hooks for fstp modules. 
""" + def _wait_handle(module): + handle = self.weight_global_handle[module] + handle.wait() + if module.bias is not None: + bias_handle = self.bias_global_handle[module] + bias_handle.wait() + + def _clear_handle(module): + if module in self.weight_global_handle: + del self.weight_global_handle[module] + if module in self.bias_global_handle: + del self.bias_global_handle[module] + # if module in self.weight_global_output: + # del self.weight_global_output[module] + # if module in self.bias_global_output: + # del self.bias_global_output[module] + def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 - self._all_gather_block_weight_memory_pool(0) + self._all_gather_block_weight(0) def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # pylint: disable=W0613 block_index = self.module_to_index[module] if self.model_checkpoint and self.is_forward is False: if block_index - 1 >= 0: - self._all_gather_block_weight_memory_pool(block_index - 1) + self._all_gather_block_weight(block_index - 1) else: # start the all-gather for next block if block_index + 1 < self.num_blocks: - self._all_gather_block_weight_memory_pool(block_index + 1) + self._all_gather_block_weight(block_index + 1) def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 - if module in self.fstp_global_handle: - handle = self.fstp_global_handle[module] - handle.wait() - if module.bias is not None: - bias_handle = self.bias_global_handle[module] - bias_handle.wait() - else: - weight_handle = all_gather_raw_memory_pool( - module.weight, - self.process_group, - async_op=True, - module=module, - ) - self.fstp_global_handle[module] = weight_handle - weight_handle.wait() + if module not in self.weight_global_handle: + self._all_gather_module_weight(module) + + _wait_handle(module) def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disable=W0613 fstp_modules = self.index_to_fstp_modules[self.num_blocks - 1] if module in fstp_modules: - weight_handle = all_gather_raw_memory_pool( - module.weight, - self.process_group, - async_op=True, - module=module, - ) - self.fstp_global_handle[module] = weight_handle - weight_handle.wait() + self._all_gather_module_weight(module) + _wait_handle(module) def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 - if module in self.fstp_global_handle: - del self.fstp_global_handle[module] + _clear_handle(module) def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 - first_backward_module = self.fstp_modules[-1] - weight_handle = all_gather_raw_memory_pool( - first_backward_module.weight, - self.process_group, - async_op=True, - module=first_backward_module, - ) - self.fstp_global_handle[first_backward_module] = weight_handle + self._all_gather_module_weight(self.fstp_modules[-1]) def _pre_backward_hook_for_head(module: nn.Module, grad_output): if self.is_forward is False: - self._all_gather_block_weight_memory_pool(self.num_blocks - 1) + self._all_gather_block_weight(self.num_blocks - 1) def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module - if module in self.fstp_global_handle: - weight_handle = self.fstp_global_handle[module] - weight_handle.wait() - else: - weight_handle = all_gather_raw_memory_pool( - module.weight, - self.process_group, - async_op=True, - module=module, - ) - self.fstp_global_handle[module] = 
weight_handle - weight_handle.wait() + if module not in self.weight_global_handle: + self._all_gather_module_weight(module) + + _wait_handle(module) # start the all-gather for next module module_index = self.fstp_modules.index(module) if module_index - 1 >= 0: next_module = self.fstp_modules[module_index - 1] - weight_handle = all_gather_raw_memory_pool( - next_module.weight, - self.process_group, - async_op=True, - module=next_module, - ) - self.fstp_global_handle[next_module] = weight_handle + self._all_gather_module_weight(next_module) def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: disable=W0613 - if module in self.fstp_global_handle: - del self.fstp_global_handle[module] + _clear_handle(module) # register forward hooks # 1. register post_forward_hook @embedding module to prefetch for block 0 diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 556752aa..45d2f51a 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -132,7 +132,7 @@ def all_gather_raw_memory_pool( module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( - gpc.fstp_handler.get_all_gather_memory(module=module), + gpc.fstp_handler.get_weight_all_gather(module=module), input_.contiguous(), group=process_group, async_op=async_op, @@ -147,7 +147,7 @@ def all_gather_raw_bias_memory_pool( module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( - gpc.fstp_handler.get_bias_memory(module=module), + gpc.fstp_handler.get_bias_all_gather(module=module), input_.contiguous(), group=process_group, async_op=async_op, @@ -177,8 +177,13 @@ def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bo def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 - size = (input_.shape[0] // world_size, *input_.shape[1:]) - output = gpc.fstp_handler.get_reduce_scatter_memory(size) + if gpc.fstp_handler.enable_memory_pool: + size = (input_.shape[0] // world_size, *input_.shape[1:]) + output = gpc.fstp_handler.get_reduce_scatter_memory(size) + else: + output = torch.empty( + input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device + ).contiguous() handle = torch.distributed.reduce_scatter_tensor( output, input_.contiguous(), group=process_group, async_op=async_op ) @@ -493,14 +498,14 @@ def forward( if world_size > 1: # do all_gather for weight and bias before actual computation if overlap_handler is not None: - total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) + total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() - # TODO memory pool for bias + if bias is not None: if overlap_handler is not None: - total_bias = gpc.fstp_handler.get_bias_memory(module=module) + total_bias = gpc.fstp_handler.get_bias_all_gather(module=module) else: total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) handle_bias.wait() @@ -554,7 +559,7 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: if overlap_handler is not None: - total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) + total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, 
async_op=True) handle_weight.wait() @@ -655,7 +660,7 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: if overlap_handler is not None: - total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) + total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index b033539d..3092a625 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -389,7 +389,9 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) + if self._fstp_handler.enable_memory_pool: + self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) + _grad = None self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) diff --git a/train.py b/train.py index 644bbebc..5ea91e8c 100644 --- a/train.py +++ b/train.py @@ -324,7 +324,7 @@ def main(args): if batch_count % 2 == 0: prof.step() - if gpc.fstp_handler is not None: + if gpc.fstp_handler is not None and gpc.fstp_handler.enable_memory_pool: gpc.fstp_handler.clear_memory_pool() # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() From 3c07423151924f7350d8e7f7b93d8150721c61df Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 14 Nov 2023 11:30:26 +0800 Subject: [PATCH 075/153] feat(model/overlap_handler.py): release weight --- internlm/model/overlap_handler.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index cb00d229..715fa467 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -258,10 +258,12 @@ def _clear_handle(module): del self.weight_global_handle[module] if module in self.bias_global_handle: del self.bias_global_handle[module] - # if module in self.weight_global_output: - # del self.weight_global_output[module] - # if module in self.bias_global_output: - # del self.bias_global_output[module] + + def _clear_weight(module): + if module in self.weight_global_output: + del self.weight_global_output[module] + if module in self.bias_global_output: + del self.bias_global_output[module] def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 self._all_gather_block_weight(0) @@ -290,6 +292,8 @@ def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disab def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 _clear_handle(module) + if not self.model_checkpoint: + _clear_weight(module) def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 self._all_gather_module_weight(self.fstp_modules[-1]) @@ -313,6 +317,7 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: di def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: disable=W0613 _clear_handle(module) + _clear_weight(module) # register forward hooks # 1. 
register post_forward_hook @embedding module to prefetch for block 0 From a1fd8778288b0ab76d20fa39290c8fe62cd5e654 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 15 Nov 2023 14:40:06 +0800 Subject: [PATCH 076/153] fix(train.py): clear memory pool before optim step --- train.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index 5ea91e8c..789094ac 100644 --- a/train.py +++ b/train.py @@ -220,7 +220,7 @@ def main(args): # start iterating the train data and begin training for batch_count in range(train_state.batch_count, total_steps): empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval) - # torch.cuda.memory._record_memory_history() + torch.cuda.memory._record_memory_history() start_time = time.time() timer("one-batch").start() @@ -262,6 +262,9 @@ def main(args): ) timer("fwd-bwd").stop() + if gpc.fstp_handler is not None and gpc.fstp_handler.enable_memory_pool: + gpc.fstp_handler.clear_memory_pool() + # update parameters, and returns (success_update, grad_norm) trainer_result = trainer.step() assert trainer_result is not None @@ -324,9 +327,7 @@ def main(args): if batch_count % 2 == 0: prof.step() - if gpc.fstp_handler is not None and gpc.fstp_handler.enable_memory_pool: - gpc.fstp_handler.clear_memory_pool() - # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() @@ -353,3 +354,5 @@ def main(args): mm.monitor_exception( alert_address=gpc.config.monitor.alert.feishu_alert_address, excp_info=traceback.format_exc() ) + + torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") From a80fcf8628bcfde37b65e09899679c814224a4e3 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 28 Nov 2023 19:33:55 +0800 Subject: [PATCH 077/153] feat(model): refactor weight and os and data patition strategy --- configs/7B_sft.py | 6 +- internlm/core/context/__init__.py | 1 + internlm/core/context/parallel_context.py | 39 +- .../core/context/process_group_initializer.py | 378 +++++-- internlm/initialize/launch.py | 7 + internlm/model/embedding.py | 56 +- internlm/model/linear.py | 23 + internlm/model/modeling_internlm.py | 31 +- internlm/model/multi_head_attention.py | 13 +- internlm/model/overlap_handler.py | 28 +- internlm/model/utils.py | 4 +- .../solver/optimizer/hybrid_zero_optim2.py | 983 ++++++++++++++++++ internlm/solver/optimizer/utils.py | 14 +- internlm/train/training_internlm.py | 6 +- internlm/utils/parallel.py | 12 +- train.py | 6 +- 16 files changed, 1498 insertions(+), 109 deletions(-) create mode 100644 internlm/solver/optimizer/hybrid_zero_optim2.py diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 63fa67e4..3c491660 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,9 +162,11 @@ defaults to False. 
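For orientation, the reworked layout configured below replaces the single tensor-parallel axis with two orthogonal factorizations of the world size: WEIGHT x WEIGHT_DATA for parameters and SEQUENCE x DATA for tokens. A minimal sketch of the derived sizes (illustrative only, not part of the patch; values taken from the docstring examples later in this commit):

    world_size = 32
    weight_size = 8       # parallel.weight.size: ranks that shard each parameter
    sequence_size = 4     # parallel.sequence: ranks that shard each sequence

    data_size = world_size // sequence_size         # 8, derived, never set by hand
    weight_data_size = world_size // weight_size    # 4, ZeRO states are sharded along this axis

    assert world_size == sequence_size * data_size == weight_size * weight_data_size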
""" parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=4, sp="intern", intern_overlap=True, memory_pool=True), + zero1=dict(size=2, fsdp=False), + tensor=dict(size=1, sp="intern", intern_overlap=False, memory_pool=False), pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=8, overlap=True, memory_pool=True), + sequence=4, ) cudnn_deterministic = False diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index 6f1142cb..e17b4ba3 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -1,6 +1,7 @@ from .parallel_context import ( IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, + IS_WEIGHT_PARALLEL, Config, ParallelContext, global_context, diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 633dfe40..8d34f608 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -26,6 +26,7 @@ IS_TENSOR_PARALLEL = "is_tensor_parallel" IS_SEQUENCE_PARALLEL = "is_sequence_parallel" +IS_WEIGHT_PARALLEL = "is_weight_parallel" logger = get_logger(__file__) @@ -289,10 +290,15 @@ def is_first_rank(self, parallel_mode: ParallelMode): def is_rank_for_log(self): """Returns a boolean value indicating whether the current device should print log.""" + # is_log_rank = ( + # self.is_first_rank(ParallelMode.DATA) + # and self.is_first_rank(ParallelMode.TENSOR) + # and self.is_last_rank(ParallelMode.PIPELINE) + # ) is_log_rank = ( - self.is_first_rank(ParallelMode.DATA) - and self.is_first_rank(ParallelMode.TENSOR) - and self.is_last_rank(ParallelMode.PIPELINE) + self.is_first_rank(ParallelMode.WEIGHT) + and self.is_first_rank(ParallelMode.DATA) + and self.is_first_rank(ParallelMode.WEIGHT_DATA) ) return is_log_rank @@ -426,11 +432,11 @@ def check_sanity(self): pps = self.pipeline_parallel_size tps = self.tensor_parallel_size ws = self.world_size - assert ws == dps * pps * tps, ( - f"Expected the world size {ws} to be equal to data" - f" parallel size ({dps}) * pipeline parallel size " - f"({pps}) * tensor parallel size ({tps})" - ) + # assert ws == dps * pps * tps, ( + # f"Expected the world size {ws} to be equal to data" + # f" parallel size ({dps}) * pipeline parallel size " + # f"({pps}) * tensor parallel size ({tps})" + # ) assert self.zero1_parallel_size > 0 assert self.data_parallel_size % self.zero1_parallel_size == 0 @@ -467,20 +473,23 @@ def init_parallel_groups(self): # set parallel size as attributes for global context parallel_config = self.config.get("parallel", None) if parallel_config is not None: + self._set_parallel_size_from_config(parallel_config, "weight", "weight_parallel_size") + self._set_parallel_size_from_config(parallel_config, "sequence", "sequence_parallel_size") self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size") self._set_parallel_size_from_config(parallel_config, "tensor", "tensor_parallel_size") self._set_parallel_size_from_config(parallel_config, "zero1", "zero1_parallel_size") # the user should not set the data parallel size manually # instead, it should be calculated based on other parallel config - self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size) + assert self.tensor_parallel_size == 1 + assert self.pipeline_parallel_size == 1 + assert self.zero1_parallel_size >= 1 + self.data_parallel_size = self.world_size // self.sequence_parallel_size + self.weight_data_parallel_size = self.world_size // 
self.weight_parallel_size # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 - if self.zero1_parallel_size <= 0: - self.zero1_parallel_size = self.data_parallel_size - assert ( self.data_parallel_size % self.config.model.get("num_experts", 1) == 0 or self.config.model.get("num_experts", 1) % self.data_parallel_size == 0 @@ -496,6 +505,8 @@ def init_parallel_groups(self): initializer_args = [ rank, world_size, + self.weight_parallel_size, + self.sequence_parallel_size, self.data_parallel_size, self.pipeline_parallel_size, self.tensor_parallel_size, @@ -506,7 +517,10 @@ def init_parallel_groups(self): # run initialization of different process groups initializers = [] + initializers.append(pgroup_initializer.Initializer_Weight(*initializer_args)) + initializers.append(pgroup_initializer.Initializer_Sequence(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) + initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Model(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) @@ -573,6 +587,7 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): if dpseed_with_tpoffset: dp_seed = seed + pipeline_offset * 1024 add_seed(ParallelMode.DATA, dp_seed) + add_seed(ParallelMode.WEIGHT_DATA, dp_seed) add_seed(ParallelMode.DUMMY, dp_seed) # model parallel seeds are different across ranks diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index e9afa2ec..ee81ac58 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -51,6 +51,15 @@ class ParallelMode(Enum): # dummy mode, only used during mode construction DUMMY = "dummy" + # weight parallel + WEIGHT = "weight" + + # weight data parallel + WEIGHT_DATA = "weight_data" + + # sequence parallel + SEQUENCE = "sequence" + class ProcessGroupInitializer(ABC): """An object, knowing the parallelism configuration, that initializes parallel groups. @@ -69,6 +78,8 @@ def __init__( self, rank: int, world_size: int, + weight_parallel_size: int, + sequence_parallel_size: int, data_parallel_size: int, pipeline_parallel_size: int, tensor_parallel_size: int, @@ -78,6 +89,8 @@ def __init__( ): self.rank = rank self.world_size = world_size + self.weight_parallel_size = weight_parallel_size + self.sequence_parallel_size = sequence_parallel_size self.data_parallel_size = data_parallel_size self.pipeline_parallel_size = pipeline_parallel_size self.tensor_parallel_size = tensor_parallel_size @@ -91,59 +104,59 @@ def init_dist_group(self, use_cpu: bool = False): pass -class Initializer_Data(ProcessGroupInitializer): - """A ProcessGroupInitializer for data parallelism. - - Args: - rank (int): The rank of current process. - world_size (int): Size of whole communication world. - data_parallel_size (int): Size of data parallel. - pipeline_parallel_size (int): Size of pipeline parallel. - tensor_parallel_size (int): Size of tensor parallel. - zero1_parallel_size (int): Size of zero1 parallel. - expert_parallel_size (int): Size of expert parallel. 
- """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.rank_num_per_dp_group = self.world_size // self.data_parallel_size - - assert self.world_size % self.data_parallel_size == 0 - - def init_dist_group(self, use_cpu: bool = False): - """Initialize data parallel groups, and assign local_ranks and groups to each gpu. - - Returns: - Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): - A Data parallelism's information tuple. - """ - local_rank = None - ranks_in_group = None - process_group = None - cpu_group = None - group_world_size = None - mode = ParallelMode.DATA - - for i in range(self.rank_num_per_dp_group): - ranks = [i + j * self.rank_num_per_dp_group for j in range(self.data_parallel_size)] - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else group - ) - else: - group_cpu = None - - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group - cpu_group = group_cpu - ranks_in_group = ranks - - return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode +# class Initializer_Data(ProcessGroupInitializer): +# """A ProcessGroupInitializer for data parallelism. + +# Args: +# rank (int): The rank of current process. +# world_size (int): Size of whole communication world. +# data_parallel_size (int): Size of data parallel. +# pipeline_parallel_size (int): Size of pipeline parallel. +# tensor_parallel_size (int): Size of tensor parallel. +# zero1_parallel_size (int): Size of zero1 parallel. +# expert_parallel_size (int): Size of expert parallel. +# """ + +# def __init__(self, *args, **kwargs): +# super().__init__(*args, **kwargs) +# self.rank_num_per_dp_group = self.world_size // self.data_parallel_size + +# assert self.world_size % self.data_parallel_size == 0 + +# def init_dist_group(self, use_cpu: bool = False): +# """Initialize data parallel groups, and assign local_ranks and groups to each gpu. + +# Returns: +# Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): +# A Data parallelism's information tuple. +# """ +# local_rank = None +# ranks_in_group = None +# process_group = None +# cpu_group = None +# group_world_size = None +# mode = ParallelMode.DATA + +# for i in range(self.rank_num_per_dp_group): +# ranks = [i + j * self.rank_num_per_dp_group for j in range(self.data_parallel_size)] +# group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) +# if use_cpu: +# group_cpu = ( +# dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) +# if dist.get_backend() != "gloo" +# else group +# ) +# else: +# group_cpu = None + +# if self.rank in ranks: +# local_rank = ranks.index(self.rank) +# group_world_size = len(ranks) +# process_group = group +# cpu_group = group_cpu +# ranks_in_group = ranks + +# return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode class Initializer_Model(ProcessGroupInitializer): @@ -329,6 +342,8 @@ class Initializer_Zero1(ProcessGroupInitializer): Args: rank (int): The rank of current process. world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. 
@@ -338,11 +353,12 @@ class Initializer_Zero1(ProcessGroupInitializer):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.rank_num_per_dp_group = self.world_size // self.data_parallel_size
-        self.num_zero1_parallel_group = self.data_parallel_size // self.zero1_parallel_size
+        self.num_zero1_parallel_group = self.world_size // self.zero1_parallel_size
+        self.weight_zero1_size = self.weight_parallel_size * self.zero1_parallel_size
+        self.num_weight_zero1_parallel_group = self.world_size // self.weight_zero1_size
 
-        assert self.world_size % self.data_parallel_size == 0
         assert self.world_size % self.zero1_parallel_size == 0
+        assert self.world_size % self.weight_zero1_size == 0
 
     def init_dist_group(self, use_cpu: bool = False):
         """Initialize zero1 parallel groups, and assign local_ranks and groups to each gpu.
@@ -350,6 +366,11 @@ def init_dist_group(self, use_cpu: bool = False):
         Returns:
             Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
                 A zero1 parallelism's information tuple.
+
+        n=32 wp=8 sp=4 zo1=2
+        wp groups: [0-7] [8-15] [16-23] [24-31]
+        zo1 groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15]
+                    [16,24] [17,25] [18,26] [19,27] [20,28] [21,29] [22,30] [23,31]
         """
         local_rank = None
         ranks_in_group = None
@@ -358,10 +379,10 @@ def init_dist_group(self, use_cpu: bool = False):
         group_world_size = None
         mode = ParallelMode.ZERO1
 
-        for i in range(self.rank_num_per_dp_group):
-            for j in range(self.num_zero1_parallel_group):
+        for i in range(self.num_weight_zero1_parallel_group):
+            for j in range(self.weight_parallel_size):
                 ranks = [
-                    i + (j * self.zero1_parallel_size + k) * self.rank_num_per_dp_group
+                    i * self.weight_zero1_size + j + k * self.weight_parallel_size
                     for k in range(self.zero1_parallel_size)
                 ]
                 group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
@@ -658,3 +679,242 @@ def init_dist_group(self, use_cpu: bool = False):
             ranks_in_group = ranks
 
         return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
+
+
+class Initializer_Weight(ProcessGroupInitializer):
+    """A ProcessGroupInitializer for model weight parallelism.
+
+    Args:
+        rank (int): The rank of current process.
+        world_size (int): Size of whole communication world.
+        weight_parallel_size (int): Size of model weight parallel.
+        sequence_parallel_size (int): Size of data sequence parallel.
+        data_parallel_size (int): Size of data parallel.
+        pipeline_parallel_size (int): Size of pipeline parallel.
+        tensor_parallel_size (int): Size of tensor parallel.
+        zero1_parallel_size (int): Size of zero1 parallel.
+        expert_parallel_size (int): Size of expert parallel.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.num_weight_parallel_group = self.world_size // self.weight_parallel_size
+
+        assert self.world_size % self.weight_parallel_size == 0
+
+    def init_dist_group(self, use_cpu: bool = False):
+        """Initialize model weight parallel groups, and assign local_ranks and groups to each gpu.
+
+        Returns:
+            Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
+                A Weight parallelism's information tuple.
+ """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.WEIGHT + + for i in range(self.num_weight_parallel_group): + ranks = [i * self.weight_parallel_size + j for j in range(self.weight_parallel_size)] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode + + +class Initializer_Sequence(ProcessGroupInitializer): + """A ProcessGroupInitializer for data sequence parallelism. + + Args: + rank (int): The rank of current process. + world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + sequence_parallel_size (int): Size of data sequence parallel. + data_parallel_size (int): Size of data parallel. + pipeline_parallel_size (int): Size of pipeline parallel. + tensor_parallel_size (int): Size of tensor parallel. + zero1_parallel_size (int): Size of zero1 parallel. + expert_parallel_size (int): Size of expert parallel. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_sequence_parallel_group = self.world_size // self.sequence_parallel_size + + assert self.world_size % self.sequence_parallel_size == 0 + + def init_dist_group(self, use_cpu: bool = False): + """Initialize data sequence parallel groups, and assign local_ranks and groups to each gpu. + + Returns: + Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): + A Sequence parallelism's information tuple. + """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.SEQUENCE + + for i in range(self.num_sequence_parallel_group): + ranks = [i * self.sequence_parallel_size + j for j in range(self.sequence_parallel_size)] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode + + +class Initializer_Data(ProcessGroupInitializer): + """A ProcessGroupInitializer for data parallelism. + + Args: + rank (int): The rank of current process. + world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + sequence_parallel_size (int): Size of data sequence parallel. + data_parallel_size (int): Size of data parallel. + pipeline_parallel_size (int): Size of pipeline parallel. + tensor_parallel_size (int): Size of tensor parallel. + zero1_parallel_size (int): Size of zero1 parallel. + expert_parallel_size (int): Size of expert parallel. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_dp_group = self.sequence_parallel_size + + assert self.world_size % self.data_parallel_size == 0 + assert self.world_size % self.sequence_parallel_size == 0 + + def init_dist_group(self, use_cpu: bool = False): + """Initialize data parallel groups, and assign local_ranks and groups to each gpu. + + Returns: + Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): + A Data parallelism's information tuple. + + n=32 wp=8 sp=4 zo1=2 + wp grops: [0-7] [8-15] [16-23] [24-31] + data groups: [0,4,8,12,16,20,24,28] [1,5,9,13,17,21,25,29] [2,6,10,14,18,22,26,30] [3,7,11,15,19,23,27,31] + """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.DATA + + for i in range(self.num_dp_group): + ranks = [i + j * self.sequence_parallel_size for j in range(self.data_parallel_size)] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode + + +class Initializer_Weight_Data(ProcessGroupInitializer): + """A ProcessGroupInitializer for common weight's data parallelism. + + Args: + rank (int): The rank of current process. + world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + sequence_parallel_size (int): Size of data sequence parallel. + data_parallel_size (int): Size of data parallel. + pipeline_parallel_size (int): Size of pipeline parallel. + tensor_parallel_size (int): Size of tensor parallel. + zero1_parallel_size (int): Size of zero1 parallel. + expert_parallel_size (int): Size of expert parallel. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_weight_dp_group = self.weight_parallel_size + self.weight_data_parallel_size = self.world_size // self.num_weight_dp_group + + assert self.world_size % self.weight_parallel_size == 0 + + def init_dist_group(self, use_cpu: bool = False): + """Initialize weight's data parallel groups, and assign local_ranks and groups to each gpu. + + Returns: + Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): + A WEIGHT_DATA parallelism's information tuple. 
+ + n=32 wp=8 sp=4 zo1=2 + wp grops: [0-7] [8-15] [16-23] [24-31] + weight data groups: [0,8,16,24] [1,9,17,25] [2,10,18,26] [3,11,19,27] + [4,12,20,28] [5,13,21,29] [6,14,22,30] [7,15,23,31] + """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.WEIGHT_DATA + + for i in range(self.num_weight_dp_group): + ranks = [i + j * self.weight_parallel_size for j in range(self.weight_data_parallel_size)] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 4eef4ded..208af18f 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -8,9 +8,11 @@ from typing import Dict, Union import torch +from torch.distributed import get_rank from internlm.core.context import Config from internlm.core.context import global_context as gpc +from internlm.core.context.process_group_initializer import ParallelMode from internlm.monitor import initialize_light_monitor from internlm.utils.common import get_master_node from internlm.utils.logger import get_logger @@ -436,6 +438,11 @@ def launch( f"number of local experts: {gpc.config.model.num_experts//gpc.expert_parallel_size}" ) + print( + f"global_rank:{gpc.get_global_rank()} wp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT)} sp_rank:{gpc.get_local_rank(ParallelMode.SEQUENCE)} zo1_rank:{gpc.get_local_rank(ParallelMode.ZERO1)} dp_rank:{gpc.get_local_rank(ParallelMode.DATA)} weight_dp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}", + flush=True, + ) + def launch_from_slurm( config: Union[str, Path, Config, Dict], diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py index d1770538..ad6823b6 100644 --- a/internlm/model/embedding.py +++ b/internlm/model/embedding.py @@ -17,6 +17,52 @@ from .utils import gather_forward_split_backward, split_forward_gather_backward +# class Embedding1D(nn.Module): +# """ +# 1D Embedding. + +# Args: +# num_embeddings (int): The size of vocab. +# embedding_dim (int): The dimention of model. +# padding_idx (int): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient; +# therefore, the embedding vector at :attr:`padding_idx` is not updated during training, +# i.e. it remains as a fixed "pad". None by default. +# dtype (Optional[torch.dtype]): Data type None by default. 
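Putting the four layouts together, every rank's coordinates can be read off with modular arithmetic. This standalone sketch (not part of the patch) matches the docstring examples above and the debug print added to launch.py:

    world_size, wp, sp = 32, 8, 4
    for rank in range(world_size):
        wp_rank = rank % wp    # WEIGHT groups are contiguous blocks of wp ranks
        sp_rank = rank % sp    # SEQUENCE groups are contiguous blocks of sp ranks
        dp_rank = rank // sp   # DATA groups stride by sp: [0,4,...,28], [1,5,...,29], ...
        wdp_rank = rank // wp  # WEIGHT_DATA groups stride by wp: [0,8,16,24], ...
    # e.g. rank 9 -> wp_rank=1, sp_rank=1, dp_rank=2, wdp_rank=1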
+ +# """ + +# def __init__( +# self, +# num_embeddings: int, +# embedding_dim: int, +# *args, +# padding_idx: int = None, +# dtype: torch.dtype = None, +# **kwargs, +# ): +# super().__init__() + +# self.num_embeddings = num_embeddings +# self.embed_dim = embedding_dim +# embed_dim_per_partition = embedding_dim // gpc.tensor_parallel_size + +# self.padding_idx = padding_idx +# self.embed_args = args +# self.embed_kwargs = kwargs + +# self.weight = nn.Parameter(torch.empty((num_embeddings, embed_dim_per_partition), dtype=dtype)) + +# def forward(self, input_: Tensor) -> Tensor: +# output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) + +# output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1) + +# if gpc.config.parallel.sequence_parallel: +# output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1) + +# return output + + class Embedding1D(nn.Module): """ 1D Embedding. @@ -44,7 +90,7 @@ def __init__( self.num_embeddings = num_embeddings self.embed_dim = embedding_dim - embed_dim_per_partition = embedding_dim // gpc.tensor_parallel_size + embed_dim_per_partition = embedding_dim // gpc.weight_parallel_size self.padding_idx = padding_idx self.embed_args = args @@ -53,12 +99,10 @@ def __init__( self.weight = nn.Parameter(torch.empty((num_embeddings, embed_dim_per_partition), dtype=dtype)) def forward(self, input_: Tensor) -> Tensor: - output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) - - output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1) + input_ = split_forward_gather_backward(input_, ParallelMode.SEQUENCE, dim=1) - if gpc.config.parallel.sequence_parallel: - output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1) + weight = gather_forward_split_backward(self.weight, ParallelMode.WEIGHT, dim=-1) + output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) return output diff --git a/internlm/model/linear.py b/internlm/model/linear.py index b92b2ee5..0948ee9c 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -77,6 +77,29 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 ) +class FSTPScaleColumnParallelLinear(BaseScaleColumnParallelLinear): + """ + ScaleColumnParallelLinear in flash implementation. + """ + + def forward(self, input, gather_dim=0): # pylint: disable=W0622 + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. + if self.weight_scale != 1: + weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() + else: + weight = self.weight + return fstp_fused_dense_func( + input, + weight, + self.bias, + process_group=self.process_group, + module=self, + handler=gpc.fstp_handler, + ) + + class MegatronScaleColumnParallelLinear(BaseScaleColumnParallelLinear): """ ScaleColumnParallelLinear in megatron implementation. 
diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index bd335c1a..4cb20999 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -9,7 +9,7 @@ from flash_attn.modules.mlp import ParallelFusedMLP from torch import nn -from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, ParallelMode +from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, IS_WEIGHT_PARALLEL, ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D @@ -17,6 +17,7 @@ MegatronScaleColumnParallelLinear, RewardModelLinear, ScaleColumnParallelLinear, + FSTPScaleColumnParallelLinear, get_mlp_cls, ) from internlm.model.multi_head_attention import MHA @@ -90,7 +91,8 @@ def __init__( self.mixer = MHA( embed_dim=hidden_size, num_heads=num_attention_heads, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(ParallelMode.WEIGHT), + sequence_process_group=gpc.get_group(ParallelMode.SEQUENCE), dropout=attn_drop_rate, max_position_embeddings=max_position_embeddings, softmax_scale=1 / math.sqrt(head_dim), @@ -119,7 +121,7 @@ def __init__( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(ParallelMode.WEIGHT), bias=False, device=device, dtype=dtype, @@ -142,6 +144,8 @@ def __init__( for _, param in self.mlp.named_parameters(): if gpc.get_world_size(ParallelMode.TENSOR) > 1: setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.WEIGHT) > 1: + setattr(param, IS_WEIGHT_PARALLEL, True) for param in self.norm1.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) @@ -312,11 +316,12 @@ def __init__( if is_reward: head_cls = RewardModelLinear else: - head_cls = ( - ScaleColumnParallelLinear - if self.sp_mode in ["flash-attn", "none", "intern"] - else MegatronScaleColumnParallelLinear - ) + # head_cls = ( + # ScaleColumnParallelLinear + # if self.sp_mode in ["flash-attn", "none", "intern"] + # else MegatronScaleColumnParallelLinear + # ) + head_cls = FSTPScaleColumnParallelLinear if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -335,6 +340,8 @@ def __init__( normal_(std=0.0052)(param) if gpc.get_world_size(ParallelMode.TENSOR) > 1: setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.WEIGHT) > 1: + setattr(param, IS_WEIGHT_PARALLEL, True) self.embed_grad_scale = embed_grad_scale self.blocks = nn.ModuleList( [ @@ -370,7 +377,7 @@ def __init__( self.head = head_cls( in_features=hidden_size, out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(ParallelMode.WEIGHT), bias=False, device=device, dtype=dtype, @@ -380,6 +387,8 @@ def __init__( normal_(std=0.0052)(param) if gpc.get_world_size(ParallelMode.TENSOR) > 1: setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.WEIGHT) > 1: + setattr(param, IS_WEIGHT_PARALLEL, True) for param in self.norm.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) @@ -410,6 +419,8 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N 
# if the sequence parallel mode is 'intern', the indexes should also be split in sequence dimension. if gpc.config.parallel.sequence_parallel and self.sp_mode == "intern": indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) + if gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1: + indexes = split_forward_gather_backward(indexes, ParallelMode.SEQUENCE, dim=0) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None @@ -431,6 +442,8 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N else: # Training hidden_states = self.head(hidden_states, gather_dim=0) + hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.SEQUENCE, dim=0) + if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) return hidden_states diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 93dbf010..d06cd967 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -10,6 +10,8 @@ import torch.nn.functional as F from einops import rearrange +from internlm.core.context import IS_WEIGHT_PARALLEL + try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func except ImportError: @@ -160,6 +162,7 @@ def __init__( embed_dim: int, num_heads: int, process_group: Optional[torch.distributed.ProcessGroup], + sequence_process_group: Optional[torch.distributed.ProcessGroup], max_position_embeddings: int = 2048, dropout: float = 0.0, softmax_scale: float = None, @@ -216,8 +219,10 @@ def __init__( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) if sp_mode == "intern": - self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group) - self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group) + self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=sequence_process_group) + self.inner_cross_attn = DistributedAttention( + self.inner_cross_attn, sequence_process_group=sequence_process_group + ) # output projection always have the bias (for now) out_proj_cls = get_linear_cls(sp_mode, "row") @@ -234,6 +239,10 @@ def __init__( for name in ["out_proj", "Wqkv"]: for param in getattr(self, name).parameters(): setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.WEIGHT) > 1: + for name in ["out_proj", "Wqkv"]: + for param in getattr(self, name).parameters(): + setattr(param, IS_WEIGHT_PARALLEL, True) def forward(self, x, seqlen=None, inference_params=None, **kwargs): if kwargs.get("indexes", None) is not None: diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 715fa467..b2131a74 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -11,7 +11,7 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.core.scheduler import SchedulerHook from internlm.model.embedding import Embedding1D -from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear +from internlm.model.linear import FSTPLinear, FSTPScaleColumnParallelLinear from internlm.model.utils import ( all_gather_raw, all_gather_raw_bias_memory_pool, @@ -55,7 +55,11 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non _chunk = _chunk.model for _chunk_name, children in _chunk.named_children(): - if 
isinstance(children, ScaleColumnParallelLinear): + if isinstance(children, FSTPScaleColumnParallelLinear): + setattr(children, "_fstp_name", "head") + setattr(children.weight, "_fstp_reduce_scatter_str", f"head.weight") + if children.bias is not None: + setattr(children.bias, "_fstp_reduce_scatter_str", f"head.bias") self.head.append(children) elif isinstance(children, Embedding1D): self.embedding.append(children) @@ -164,7 +168,7 @@ def _get_bias_from_memory_pool(self, module: nn.Module): return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] def get_weight_all_gather(self, module): - if self.enable_memory_pool: + if self.enable_memory_pool and getattr(module, "_fstp_name") != "head": return self._get_weight_from_memory_pool(module) else: return self.weight_global_output[module] @@ -201,7 +205,7 @@ def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True def _all_gather_module_weight(self, module): - if self.enable_memory_pool: + if self.enable_memory_pool and getattr(module, "_fstp_name") != "head": if module.bias is not None: bias_handle = all_gather_raw_bias_memory_pool( module.bias, @@ -319,6 +323,16 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: _clear_handle(module) _clear_weight(module) + def _pre_hook_for_head(module: nn.Module, inputs: Any): # pylint: disable=W0613 + if module not in self.weight_global_handle: + self._all_gather_module_weight(module) + + _wait_handle(module) + + def _post_hook_for_head(module, grad_input, grad_output): # pylint: disable=W0613 + _clear_handle(module) + _clear_weight(module) + # register forward hooks # 1. register post_forward_hook @embedding module to prefetch for block 0 # 2. register pre_forward_hook @out_proj module to prefetch for next block, @@ -354,6 +368,12 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: module.register_full_backward_pre_hook(_pre_backward_hook_for_module) module.register_full_backward_hook(_post_backward_hook_for_module) + for head in self.head: + head.register_forward_pre_hook(_pre_hook_for_head) + head.register_full_backward_pre_hook(_pre_hook_for_head) + head.register_forward_hook(_post_hook_for_head) + head.register_full_backward_hook(_post_hook_for_head) + class FSTPOverlapSchedulerHook(SchedulerHook): """ diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 45d2f51a..89980c07 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -494,7 +494,7 @@ def forward( x = x.to(dtype=torch.get_autocast_gpu_dtype()) total_x = x.contiguous() - world_size = gpc.get_world_size(ParallelMode.TENSOR) + world_size = gpc.get_world_size(ParallelMode.WEIGHT) if world_size > 1: # do all_gather for weight and bias before actual computation if overlap_handler is not None: @@ -556,7 +556,7 @@ def backward(ctx, grad_output, *args): batch_dim = batch_shape.numel() grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - world_size = gpc.get_world_size(ParallelMode.TENSOR) + world_size = gpc.get_world_size(ParallelMode.WEIGHT) if world_size > 1: if overlap_handler is not None: total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py new file mode 100644 index 00000000..7ab9823b --- /dev/null +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -0,0 +1,983 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import math +from 
functools import partial +from typing import List, Optional + +import torch +import torch.distributed as dist +from torch.optim import Optimizer + +from internlm.core.context import IS_SEQUENCE_PARALLEL, Config, ParallelMode +from internlm.core.context import global_context as gpc +from internlm.monitor import send_alert_message +from internlm.solver.optimizer.store import ( + BucketStore, + GradientStore, + ParameterStore, + TensorBucket, +) +from internlm.solver.optimizer.utils import ( + DynamicGradScaler, + ParamBcastSyncHandler, + flatten, + get_grad_accumulate_object, + has_inf_or_nan, + reduce_tensor, + release_param_grad, + split_half_float_double, + sync_param, +) +from internlm.utils.common import get_current_device +from internlm.utils.logger import get_logger +from internlm.utils.megatron_timers import megatron_timer as timer +from internlm.utils.timeout import llm_timeout + +from .base_optimizer import BaseOptimizer +from .utils import compute_layer_norm, compute_norm, compute_param_norm + +inf = math.inf +logger = get_logger(__file__) + + +class HybridZeroOptimizer2(BaseOptimizer): + """ + Hybrid Zero Optimizer. + """ + + def __init__( + self, + optimizer: Optimizer, + cpu_offload=False, + grad_scal_cfg: Config = None, + zero_cfg: Config = None, + param_bcast_sync_handler: ParamBcastSyncHandler = None, + ): + # DynamicGradScaler related args + if gpc.config.model.dtype is torch.float32: + initial_scale = 1 + else: + initial_scale = grad_scal_cfg.fp16.initial_scale + min_scale = grad_scal_cfg.fp16.min_scale + growth_interval = grad_scal_cfg.fp16.growth_interval + growth_factor = grad_scal_cfg.growth_factor + backoff_factor = grad_scal_cfg.backoff_factor + hysteresis = grad_scal_cfg.hysteresis + max_scale = grad_scal_cfg.max_scale + + # Zero related args + reduce_bucket_size = zero_cfg.reduce_bucket_size + clip_grad_norm = zero_cfg.clip_grad_norm + self._overlap_sync_grad = zero_cfg.overlap_sync_grad + self._overlap_sync_param = zero_cfg.overlap_sync_param + + super().__init__(optim=optimizer) + + self._cpu_offload = cpu_offload + self._zero_local_rank = [] + self._zero_world_size = [] + self._broadcast_parallel_mode = [] + + # ParameterStore will manage the tensor buffers used for zero + # it will not manage the tensors used by mixed precision training + self._param_store = ParameterStore(ParallelMode.ZERO1) + self._grad_store = GradientStore(ParallelMode.WEIGHT_DATA) + self._bucket_store: List[BucketStore] = [] + self._accum_grad_buckets: List[BucketStore] = [] + self._bucket_in_progress = [] + + # fp16 and fp32 params for mixed precision training + self._fp16_param_groups = dict() + self._fp32_flat_param_groups_of_current_rank = dict() + + # communication params + # self._overlap_communication = overlap_communication + self._reduce_bucket_size = reduce_bucket_size + + self._comm_bcast_stream = torch.cuda.Stream() + + # gradient scaler + self.grad_scaler = DynamicGradScaler( + initial_scale=initial_scale, + min_scale=min_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + max_scale=max_scale, + ) + self._found_overflow = torch.cuda.FloatTensor([0], device=get_current_device()) + + # gradient clipping + self._clip_grad_norm = clip_grad_norm + + # need to record the rank in which parameter groups are not assigned parameters. 
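The fields read above parameterize a standard dynamic loss-scaling policy. A behavioral sketch (simplified, hysteresis omitted; this is not the DynamicGradScaler implementation):

    def update_scale(scale, found_overflow, good_steps, growth_factor=2.0,
                     backoff_factor=0.5, growth_interval=1000,
                     min_scale=1.0, max_scale=2**24):
        if found_overflow:                     # skip this step and back off
            return max(scale * backoff_factor, min_scale), 0
        good_steps += 1
        if good_steps % growth_interval == 0:  # long overflow-free run: grow
            return min(scale * growth_factor, max_scale), good_steps
        return scale, good_steps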
+ self.param_group_has_params = [] + self.param_group_no_params_ranks = [] + self.padding_grad = torch.zeros([32], dtype=gpc.config.model.dtype, device=get_current_device()) + self.padding_tensor = torch.zeros([32], dtype=gpc.config.model.dtype, device=get_current_device()) + + self.rank_unique_id = ( + f"gpus-{gpc.get_world_size(ParallelMode.GLOBAL)}_" + + f"wp-{gpc.get_local_rank(ParallelMode.WEIGHT)}_" + + f"sp-{gpc.get_local_rank(ParallelMode.SEQUENCE)}_" + + f"dp-{gpc.get_local_rank(ParallelMode.DATA)}_" + + f"wdp-{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}_" + + f"zo-{gpc.get_local_rank(ParallelMode.ZERO1)}.pt" + ) + self.params_per_rank_id_dict = [] + self._param_bcast_sync_handler = param_bcast_sync_handler + if self._overlap_sync_param: + assert self._param_bcast_sync_handler is not None + + if gpc.config.parallel.weight >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + self._fstp_handler = gpc.fstp_handler + else: + self._fstp_handler = None + + # iterate over the param group in the optimizer + # partition these param groups for data parallel training + # and add buffers to parameter store for future access + for group_id, param_group in enumerate(self.optim.param_groups): + group_params = param_group["params"] + + # set the dtype for each param group + param_group["dtype"] = group_params[0].dtype if len(group_params) != 0 else None + + # add the fp16 params to fp16_param_groups for bookkeeping + self._fp16_param_groups[group_id] = group_params + + # to find real zero mode. if zero is not used, set all param group as ParallelMode.ZERO1 + # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode + zero_mode = ( + ParallelMode.ZERO1 + if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA + else ParallelMode.EXPERT_DATA + ) + self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) + self._zero_world_size.append(gpc.get_world_size(zero_mode)) + # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name + self._broadcast_parallel_mode.append(zero_mode) + self._bucket_store.append(BucketStore(group_id, ParallelMode.WEIGHT_DATA)) + self._accum_grad_buckets.append(BucketStore(group_id, ParallelMode.WEIGHT_DATA)) + + # assign parameters to ranks the params in the list are sorted + params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group) + self.param_group_no_params_ranks.append(no_params_ranks) + self.param_group_has_params.append(self._zero_local_rank[group_id] not in no_params_ranks) + + # store the mapping between param to rank each param should belong to only one rank. + # we can skip the moe param and do not keep them in _param_store to save memory + # (means we need to deal with moe param in a different way), but it will increase + # complexity and reduce code readablity. + for rank, params in enumerate(params_per_rank): + # check whether any rank is not assigned params. + if len(params) != 0: + self._param_store.add_fp16_param_list_by_rank_group(rank, group_id, params) + for param in params: + setattr(param, "group_id", group_id) + self._param_store.set_param_to_rank(param, rank) + + # move to cpu to make room to create the flat tensor + for param in group_params: + param.data = param.data.cpu() + + # flatten the reordered tensors + for rank in range(self._zero_world_size[group_id]): + # No flat fp16 buffer is allocated if the process has no parameters. 
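For reference, the flat-buffer bookkeeping that the loop below performs can be reduced to a few lines. A simplified sketch (the real logic lives in ParameterStore; the helper name is illustrative):

    from torch._utils import _flatten_dense_tensors

    def build_rank_buffers(fp16_params):
        # one contiguous fp16 buffer per (rank, group) ...
        flat_fp16 = _flatten_dense_tensors(fp16_params).cuda()
        # ... plus a trainable fp32 master copy that the optimizer steps on
        master_fp32 = flat_fp16.float()
        master_fp32.requires_grad = True
        return flat_fp16, master_fp32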
+ if rank not in self.param_group_no_params_ranks[group_id]: + tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + with torch.no_grad(): + flat_tensor = flatten(tensor_list) + flat_tensor = flat_tensor.data.cuda() + self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) + sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + + # create a copy of fp32 weights of the parameters for which this rank is responsible + # No flat fp32 buffer is allocated if the process has no parameters. + if self.param_group_has_params[group_id]: + fp16_flat_current_rank = self._param_store.get_flat_fp16_param_by_rank_group( + self._zero_local_rank[group_id], group_id + ) + fp32_flat_current_rank = fp16_flat_current_rank.float() + device = "cpu" if self._cpu_offload else get_current_device() + fp32_flat_current_rank = fp32_flat_current_rank.to(device) + fp32_flat_current_rank.requires_grad = True + self._fp32_flat_param_groups_of_current_rank[group_id] = fp32_flat_current_rank + + # need to replace the params in the `params` field in the optimizer + # so that when the optimizer calls step(), it only updates the tensors + # managed by this data parallel rank + param_group["params"] = [fp32_flat_current_rank] + + # set reduction state + for param in self._fp16_param_groups[group_id]: + self._param_store.set_param_reduction_state(param, False) + + assert len(self._fp16_param_groups) != 0 + + # If a rank is not assigned any arguments, 'has_params' is False. + self.has_params = sum(self.param_group_has_params) != 0 + # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled. + self.skip_grad_reduce = False + + # reduction hook is only used if overlapping communication + # if it is stage 1 without overlapping, no hook will be attached + self._attach_reduction_hook() + + @property + def zero_local_rank(self): + return self._zero_local_rank + + @property + def zero_world_size(self): + return self._zero_world_size + + @property + def loss_scale(self): + return self.grad_scaler.scale + + @property + def num_param_groups(self): + return len(self._fp16_param_groups) + + def _partition_param_list(self, group_id, param_group): + no_params_ranks = [] + params_per_rank = [[] for _ in range(self._zero_world_size[group_id])] + numel_per_rank = [0 for _ in range(self._zero_world_size[group_id])] + self.params_per_rank_id_dict.append([[] for _ in range(self._zero_world_size[group_id])]) + param_list = param_group["params"] + + sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True) + for i, param in enumerate(sorted_params): + global_id = str(i) + for j in range(len(param.size())): + global_id = "_".join([global_id, str(param.size()[j])]) + if self._overlap_sync_param: + rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param) + else: + rank_to_go = numel_per_rank.index(min(numel_per_rank)) + params_per_rank[rank_to_go].append(param) + self.params_per_rank_id_dict[-1][rank_to_go].append(global_id) + numel_per_rank[rank_to_go] += param.numel() + + # check whether any rank is not assigned to parameters. 
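_partition_param_list above is a greedy balanced partition: largest parameters first, each assigned to the currently lightest ZeRO rank. An equivalent standalone sketch (illustrative only):

    def partition(params, world_size):
        buckets = [[] for _ in range(world_size)]
        load = [0] * world_size
        for p in sorted(params, key=lambda x: x.numel(), reverse=True):
            rank = load.index(min(load))   # lightest rank so far
            buckets[rank].append(p)
            load[rank] += p.numel()
        return buckets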
+ for rank, params in enumerate(params_per_rank): + if len(params) == 0: + no_params_ranks.append(rank) + + if gpc.is_rank_for_log(): + logger.info( # pylint: disable=W1203 + f"Number of elements on ranks: {numel_per_rank}, rank:{gpc.get_global_rank()}" + ) + + return params_per_rank, set(no_params_ranks) + + def _is_moe_group(self, param_group): + return "moe" in param_group.keys() and param_group["moe"] + + def _is_norm_group(self, param_group): + return "norm" in param_group.keys() and param_group["norm"] + + def _is_gate_group(self, param_group): + return "gate" in param_group.keys() and param_group["gate"] + + # TODO check expert dp is correct when enable moe and overlap both + def _attach_reduction_hook(self): + # we iterate over the fp16 params + # on each param, we register a hook to its AccumulateGrad object + for group_id in range(self.num_param_groups): + param_group = self._fp16_param_groups[group_id] + for param in param_group: + # we should not reduce the param in moe + if not param.requires_grad: + continue + + reduce_rank = None + + def _define_and_attach(param, reduce_rank=None): + reduction_func = partial( + self._store_and_try_reduce_grads_by_bucket, + param=param, + reduce_rank=reduce_rank, + ) + + reduce_scatter_checker = partial( + self._wait_reduce_scatter_and_accumulate_grads, + param=param, + reduce_rank=reduce_rank, + ) + + def reduction_sp_func(): + handle = reduce_tensor( + param.grad, + dtype=None, + dst_rank=reduce_rank, + parallel_mode=ParallelMode.GLOBAL, + ) + handle.wait() + + # define hook + # NOT IMPORTANT BUT GOOD TO KNOW: + # args here is not grad, but allow_unreacable and accumulate_grad + def reduce_grad_hook(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_func() + + # define hook for real gradient accumulation. + def accum_grad_hook(*args): # pylint: disable=W0613 + reduce_scatter_checker() + + # define hook for sequence_parallel + def reduce_grad_hook_sp(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_sp_func() + + # get the AccumulateGrad object of the param itself + # If these objects are not kept, reduction hooks may not be attached successfully. + accum_grad_obj = get_grad_accumulate_object(param) + self._grad_store.add_accumulate_grad_object(accum_grad_obj) + + # if sequence_parallel is True, + # the grad of norm should be all-reduce across the tp process group + if ( + gpc.config.parallel.sequence_parallel is True + and hasattr(param, IS_SEQUENCE_PARALLEL) + and getattr(param, IS_SEQUENCE_PARALLEL) is True + ): + accum_grad_obj.register_hook(reduce_grad_hook_sp) + + # we should not only register for parameters which have _fstp_reduce_scatter_str attr. + # we must keep up with reduce_grad_hook. + if self._fstp_handler is not None: + accum_grad_obj.register_hook(accum_grad_hook) + + if self._overlap_sync_grad: + accum_grad_obj.register_hook(reduce_grad_hook) + + _define_and_attach(param, reduce_rank) + + def accumulate_left_grads_after_backward(self): + if self._fstp_handler is None: + return + + for group_id in range(self.num_param_groups): + self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id]) + + def belongs_to_current_rank(self, param) -> bool: + """ + Check whether a parameter is supposed to be updated by the process of the current rank + + :param tensor: A :class:`torch.Tensor` object + :type tensor: torch.Tensor + + :return: True if the parameter should be updated by the current rank. Otherwise false. 
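The hooks above hang off each parameter's AccumulateGrad node so they fire after .grad is materialized. The usual trick behind get_grad_accumulate_object looks like this (a common pattern, shown here for context; the print is a stand-in for the reduce hooks):

    import torch

    param = torch.nn.Parameter(torch.randn(4, 4))
    # expand_as adds a graph edge whose next function is the leaf's
    # AccumulateGrad node; hooks on it run once param.grad is written.
    accum_obj = param.expand_as(param).grad_fn.next_functions[0][0]
    accum_obj.register_hook(lambda *args: print("grad ready"))  # keep accum_obj alive
    param.sum().backward()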
+        :rtype: bool
+        """
+        tensor_rank = self._param_store.get_param_rank(param)
+        group_id = getattr(param, "group_id")
+        return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id])
+
+    def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None:
+        for _param in bucket.get_param(reduce_rank):
+            if not hasattr(_param, "_fstp_reduce_scatter_str"):
+                continue
+
+            # wait and accumulate gradient.
+            _key = getattr(_param, "_fstp_reduce_scatter_str")
+            _comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[_key]
+            _comm_handle.wait()
+            _param.grad.add_(_grad)
+
+            # release cuda memory.
+            if self._fstp_handler.enable_memory_pool:
+                self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index)
+            _grad = None
+            self._fstp_handler.reduce_scatter_handlers[_key] = None
+
+        bucket.reset_by_rank(reduce_rank)
+
+    def _wait_reduce_scatter_and_accumulate_grads(self, param, reduce_rank: Optional[int] = None):
+        param_size = param.numel()
+
+        group_id = getattr(param, "group_id")
+        current_bucket = self._accum_grad_buckets[group_id]
+
+        # check if the bucket is full
+        # if full, will reduce the grads already in the bucket
+        # after reduction, the bucket will be empty
+        if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size:
+            self._accum_grads_store_in_bucket(current_bucket, reduce_rank)
+
+        # otherwise, add the parameter into bucket.
+        current_bucket.add_num_elements_in_bucket(param_size, reduce_rank)
+        current_bucket.add_param(param, reduce_rank)
+
+    def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None):
+        param_size = param.numel()
+
+        # check if the bucket is full
+        # if full, will reduce the grads already in the bucket
+        # after reduction, the bucket will be empty
+        group_id = getattr(param, "group_id")
+        current_bucket = self._bucket_store[group_id]
+
+        if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size:
+            self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False)
+
+        # the param must not be reduced to ensure correctness
+        is_param_reduced = self._param_store.is_param_reduced(param)
+        if is_param_reduced:
+            msg = (
+                f"Parameter of size ({param.size()}) has already been reduced, "
+                + "duplicate reduction will lead to arithmetic incorrectness"
+            )
+            raise RuntimeError(msg)
+
+        # the param must have grad for reduction
+        assert param.grad is not None, f"Parameter of size ({param.size()}) has None grad, cannot be reduced"
+
+        current_bucket.add_num_elements_in_bucket(param_size, reduce_rank)
+        current_bucket.add_grad(param.grad, reduce_rank)
+        current_bucket.add_param(param, reduce_rank)
+
+    def _reduce_grads_stored_in_bucket(self, current_bucket, reduce_rank=None, last_bucket=False):
+        # reduce grads
+        self._reduce_grads_by_rank(
+            reduce_rank=reduce_rank,
+            grads=current_bucket.get_grad(reduce_rank=reduce_rank),
+            bucket_size=current_bucket.num_elements_in_bucket(reduce_rank),
+            group_id=current_bucket.get_param_group_id(),
+            dp_parallel_mode=current_bucket.get_dp_parallel_mode(),
+        )
+
+        params_in_bucket = current_bucket.get_param(reduce_rank=reduce_rank)
+
+        for param in params_in_bucket:
+            # the is_param_reduced flag should be False showing that
+            # this param is not reduced before calling self._reduce_grads_by_rank
+            is_param_reduced = self._param_store.is_param_reduced(param)
+
+            if is_param_reduced:
+                msg = (
+                    f"Parameter of size ({param.size()}) has been reduced, "
+                    + "duplicate reduction will lead to arithmetic incorrectness"
+                )
+                raise RuntimeError(msg)
+
+            # update the flag
+            self._param_store.set_param_reduction_state(param, True)
+
+            if self.belongs_to_current_rank(param):
+                self._param_store.add_reduced_param_for_compute_norm(param, last_bucket)
+            else:
+                self._param_store.add_previous_reduced_param(param)
+
+        current_bucket.reset_by_rank(reduce_rank)
+
+    def _reduce_grads_by_rank(self, reduce_rank, grads, bucket_size, group_id, dp_parallel_mode):
+        grad_buckets_by_dtype = split_half_float_double(grads)
+        next_bucket_list = []
+        # add parameters into bucket for reduction
+        for tensor_list in grad_buckets_by_dtype:
+            param_bucket = TensorBucket(size=bucket_size)
+            for tensor in tensor_list:
+                param_bucket.add_to_bucket(tensor, allow_oversize=True)
+            if not param_bucket.is_empty():
+                self._reduce_and_copy(
+                    bucket=param_bucket, reduce_rank=reduce_rank, group_id=group_id, dp_parallel_mode=dp_parallel_mode
+                )
+                next_bucket_list.append(param_bucket)
+
+        # wait for the completion of previous bucket list reduction, and do unflatten_and_copy()
+        # here we can also overlap the communication with some memcpy operation caused by bucket.flatten()
+        for bucket in self._bucket_in_progress:
+            bucket.commu_handle.wait()
+            bucket.unflatten_and_copy()
+            bucket.empty()
+        self._bucket_in_progress = []
+        self._param_store.clear_grads_of_previous_reduced_params()
+
+        # after the completion of bucket list reduction, add new buckets into _bucket_in_progress
+        self._bucket_in_progress = next_bucket_list.copy()
+
+    def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank, group_id, dp_parallel_mode):
+        # flatten the tensors and do allreduce
+        bucket.flatten()
+        bucket.commu_handle = reduce_tensor(
+            tensor=bucket.get_flat_tensor(),
+            dtype=None,
+            dst_rank=reduce_rank,
+            parallel_mode=dp_parallel_mode,
+        )
+
+        # update the reduced tensor
+        if reduce_rank is None or reduce_rank == self._zero_local_rank[group_id]:
+            bucket.set_unflatten_and_copy_flag(flag=True)
+
+    def _has_inf_or_nan(self, tensor):
+        try:
+            tensor_mean = float(tensor.mean())
+        except RuntimeError as instance:
+            # We want to check if instance is actually an overflow exception.
+            # RuntimeError could come from a different error.
+            # If so, we still want the exception to propagate.
+            if "value cannot be converted" not in instance.args[0]:
+                raise
+            return True
+        else:
+            if tensor_mean == float("inf") or tensor_mean == -float("inf"):
+                return True
+            return False
+
+    def _sync_grad(self):
+        # update param already reduced flag
+        reduction_states = self._param_store.get_param_reduction_states()
+        for tensor, _ in reduction_states.items():
+            reduction_states[tensor] = False
+        self._param_store.reset_reduced_data_for_compute_norm()
+
+        # accumulate gradient
+        avg_gradients = self._grad_store._averaged_gradients
+        for group_id in range(self.num_param_groups):
+            # the following operations are performed only on the rank to which parameters are assigned.
+    def _has_inf_or_nan(self, tensor):
+        try:
+            tensor_mean = float(tensor.mean())
+        except RuntimeError as instance:
+            # We want to check if instance is actually an overflow exception.
+            # RuntimeError could come from a different error.
+            # If so, we still want the exception to propagate.
+            if "value cannot be converted" not in instance.args[0]:
+                raise
+            return True
+        else:
+            if tensor_mean == float("inf") or tensor_mean == -float("inf"):
+                return True
+            return False
+
+    def _sync_grad(self):
+        # update param already reduced flag
+        reduction_states = self._param_store.get_param_reduction_states()
+        for tensor, _ in reduction_states.items():
+            reduction_states[tensor] = False
+        self._param_store.reset_reduced_data_for_compute_norm()
+
+        # accumulate gradient
+        avg_gradients = self._grad_store._averaged_gradients
+        for group_id in range(self.num_param_groups):
+            # the following operations are performed only on the rank to which parameters are assigned.
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                param_group = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id)
+
+                if group_id not in avg_gradients:
+                    avg_gradients[group_id] = []
+
+                param_idx = 0
+                for param in param_group:
+                    if param.grad is not None:
+                        if len(avg_gradients[group_id]) == param_idx:
+                            avg_gradients[group_id].append(param.grad)
+                        else:
+                            avg_gradients[group_id][param_idx].add_(param.grad)
+                        param_idx += 1
+
+        # the gradients needed are stored in the avg_gradients buffer; thus, we can clear the originals
+        self.zero_grad()
+
+    def zero_grad(self, set_to_none=True):
+        """
+        Set parameter gradients to zero. If set_to_none = True, gradient
+        will be set to None to save memory.
+
+        :param set_to_none: Whether to set the gradients to None. Default value is True.
+        :type set_to_none: bool
+        """
+        for _, param_group in self._fp16_param_groups.items():
+            for param in param_group:
+                if set_to_none:
+                    param.grad = None
+                elif param.grad is not None:
+                    param.grad.detach()
+                    param.grad.zero_()
+                else:
+                    pass
+
+    def backward(self, loss, retain_graph=False):
+        loss = self.loss_scale * loss
+        loss.backward(retain_graph=retain_graph)
+
+        # Gradients may not be fully synchronized here.
+
+    def _compute_norm_with_stage(
+        self,
+        group_id: int = 0,
+        last_bucket: bool = False,
+        last_stage: bool = False,
+        previous_norm=None,
+    ):
+        # compute norm for gradients that have been reduced
+        params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
+        if len(params) == 0:
+            dtype = self.param_groups[group_id]["dtype"]
+            grads = [self.padding_grad.to(dtype)]
+            params = [self.padding_tensor.to(dtype)]
+
+        norm = 0
+        if self._clip_grad_norm > 0:
+            # this norm is before scaling, it will be very large
+            norm = compute_norm(
+                gradients=grads,
+                parameters=params,
+                last_stage=last_stage,
+                previous_norm=previous_norm,
+                zero_mode=self._broadcast_parallel_mode[group_id],
+            )
+
+        return norm
+
+    def _compute_param_norm_stage(
+        self, group_id: int = 0, last_bucket: bool = False, last_stage: bool = False, previous_param_norms=None
+    ):
+        # compute norm for gradients that have been reduced
+        params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
+
+        total_param_norms = {}
+        if len(params) == 0:
+            dtype = self.param_groups[group_id]["dtype"]
+            grads = [self.padding_grad.to(dtype)]
+            params = [self.padding_tensor.to(dtype)]
+
+        if self._clip_grad_norm > 0:
+            total_param_norms = compute_param_norm(
+                grads,
+                params,
+                last_stage=last_stage,
+                previous_param_norms=previous_param_norms,
+                zero_mode=self._broadcast_parallel_mode[group_id],
+                is_moe_group=self._is_moe_group(self.optim.param_groups[group_id]),
+            )
+        return total_param_norms
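`_compute_norm_with_stage` is called once per group while buckets are still
being reduced and once more with last_stage=True for the final bucket,
threading the partial result through `previous_norm`; only the last stage pays
for a cross-rank reduction, and the square root is applied later in `_step`
(the `norm**0.5` there). A minimal sketch of that two-stage pattern, assuming
a flat list of gradient tensors and a single process group rather than the
store objects used above:

    import torch
    import torch.distributed as dist

    def staged_sq_norm(grads, previous_sq=0.0, last_stage=False, group=None):
        # stay in the squared domain so partial results from stages simply add up
        partial = sum(float(g.float().pow(2).sum()) for g in grads) + previous_sq
        if not last_stage:
            return partial  # fed back in as previous_sq on the next call
        total = torch.tensor(partial, dtype=torch.float32, device="cuda")
        dist.all_reduce(total, op=dist.ReduceOp.SUM, group=group)
        return total.item()  # caller applies **0.5 to get the global L2 norm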
+ """ + assert closure is None, "closure is not supported by step()" + + # if not overlapping communication (no reduction hook is attached) + # we need to manually reduce these gradients + if not self._overlap_sync_grad: + for group_id in range(len(self._fp16_param_groups)): + for param in self._fp16_param_groups[group_id]: + # we should not reduce the param in moe + if param.grad is not None: + self._store_and_try_reduce_grads_by_bucket(param) + + # we need to reduce the gradients left in the communication bucket + for group_id in range(self.num_param_groups): + self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) + + # compute norm for gradients in the before bucket + groups_norms = [] + groups_param_norms = [] + for group_id in range(self.num_param_groups): + groups_norms.append(self._compute_norm_with_stage(group_id=group_id)) + if gpc.config.get("grad_norm_profiling", False): + groups_param_norms.append(self._compute_param_norm_stage(group_id=group_id)) + + # clear reduced grads + # grads in the last bucket is reduced + for bucket in self._bucket_in_progress: + bucket.commu_handle.wait() + bucket.unflatten_and_copy() + bucket.empty() + self._bucket_in_progress = [] + self._param_store.clear_grads_of_previous_reduced_params() + # compute norm for gradients in the last bucket + total_norms = {} + total_param_norms = {} + total_layer_norms = {} + for group_id in range(self.num_param_groups): + group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default" + group_name = f"{group_id}_{group_name}" + total_norms[group_name] = self._compute_norm_with_stage( + group_id=group_id, + last_bucket=True, + last_stage=True, + previous_norm=groups_norms[group_id], + ) + if gpc.config.get("grad_norm_profiling", False): + param_norms = self._compute_param_norm_stage( + group_id=group_id, + last_bucket=True, + last_stage=True, + previous_param_norms=groups_param_norms[group_id], + ) + total_layer_norms[group_name], total_param_norms[group_name] = compute_layer_norm( + param_norms=param_norms, loss_scale=self.loss_scale.item() + ) + + # Need to allreduce(avg) the norms across different ranks because moe params will not be synced + # during allreduce + if self._is_moe_group(self.optim.param_groups[group_id]): + # model and zero have been reduced!!! 
+    def _step(self, closure=None, norms=None):
+        assert closure is None, "closure is not supported by step()"
+
+        # check for overflow
+        found_inf = False
+        found_nan = False
+        # if there are INF values in grads, the compute_norm func would also return -1
+        # thus, we try to avoid calling _check_overflow here
+        # found_inf = self._check_overflow()
+        # Because you may encounter inf when computing norm
+
+        if -1 in norms.values():
+            found_inf = True
+
+        if -2 in norms.values():
+            found_nan = True
+
+        loss_scale = float(self.loss_scale.item())  # backup
+        if gpc.config.model.dtype is not torch.float32:
+            self.grad_scaler.update(found_inf)
+
+        # update loss scale if overflow occurs
+        if found_inf:
+            if gpc.is_rank_for_log():
+                logger.warning("Overflow occurs, please check it.")
+                send_alert_message(
+                    address=gpc.config.monitor.alert.feishu_alert_address,
+                    message="Overflow occurs, please check it.",
+                )
+            self._grad_store._averaged_gradients = dict()
+            self.zero_grad()
+            return False, norms
+
+        if found_nan:
+            if gpc.is_rank_for_log():
+                logger.warning("Nan grad norm occurs, please check it.")
+                send_alert_message(
+                    address=gpc.config.monitor.alert.feishu_alert_address,
+                    message="Nan grad norm occurs, please check it.",
+                )
+            self._grad_store._averaged_gradients = dict()
+            self.zero_grad()
+            return False, norms
+        # copy the grad of fp16 param to fp32 param
+        single_grad_partition_groups = []
+        for group_id in range(self.num_param_groups):
+            # The following operations are performed only on the rank to which parameters are assigned.
+            if not self.param_group_has_params[group_id]:
+                continue
+
+            # create flat gradient for the flat fp32 params
+            gradients = self._grad_store.get_averaged_gradients_by_group(group_id)
+            with torch.no_grad():
+                flat_fp16_avg_grads = flatten(gradients)
+            self._grad_store.reset_average_gradients_by_group(group_id)
+            gradients = None  # release cuda memory
+
+            dtype = self._fp32_flat_param_groups_of_current_rank[group_id].dtype
+            flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype)
+            flat_fp16_avg_grads = None  # release cuda memory
+
+            param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape
+            assert (
+                param_shape == flat_fp32_avg_grads.shape
+            ), f"fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}"
+
+            # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
+            # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
+            is_tp_sync_groups = (
+                self._is_norm_group(self.optim.param_groups[group_id]),
+                self._is_gate_group(self.optim.param_groups[group_id]),
+            )
+            if any(is_tp_sync_groups):
+                dist.all_reduce(
+                    flat_fp32_avg_grads,
+                    op=dist.ReduceOp.AVG,
+                    group=gpc.get_group(ParallelMode.TENSOR),
+                )
+
+            single_grad_partition_groups.append(flat_fp32_avg_grads)
+            device = self._fp32_flat_param_groups_of_current_rank[group_id].device
+            self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device)
+        # unscale and clip grads
+        # get the global norm
+        global_norm_groups = {}
+        if self._clip_grad_norm > 0:
+            for group_name, norm in norms.items():
+                global_norm_groups[group_name] = norm**0.5
+
+        # the following operations are performed only on the rank to which parameters are assigned.
+        if gpc.config.model.dtype is not torch.float32:
+            if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
+                self._unscale_and_clip_grads(
+                    single_grad_partition_groups,
+                    list(global_norm_groups.values()),
+                    loss_scale,
+                )
+
+        # update the parameters
+        timer("step").start()
+
+        # For those ranks that are not assigned parameters, we just wait for other ranks
+        # to broadcast their updated parameters.
+        if self.has_params:
+            self.optim.step()
+            # release the fp32 grad
+            release_param_grad(self._fp32_flat_param_groups_of_current_rank.values())
+            # update fp16 partition updated by the current rank
+            for group_id in range(len(self._fp16_param_groups)):
+                if self.param_group_has_params[group_id]:
+                    fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(
+                        rank=self._zero_local_rank[group_id], group_id=group_id
+                    )
+                    fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
+                    fp16_param.data.copy_(fp32_param)
+        torch.cuda.synchronize()
+        with torch.cuda.stream(self._comm_bcast_stream):
+            self.broadcast_params()
+
+        timer("step").stop()
+
+        # re-syncing gradients is not needed here, because the sync_params function is used in
+        # initialization, so synchronization is maintained
+        for group_name, global_norm in global_norm_groups.items():
+            global_norm_groups[group_name] = global_norm / loss_scale
+        return True, global_norm_groups
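The clipping path in `_step` folds loss-scale removal and norm clipping into a
single multiply per flat gradient (see `_unscale_and_clip_grads` below). A
worked sketch with made-up numbers, assuming loss_scale=1024, max_norm=1.0 and
a true (unscaled) gradient norm of 2.0:

    def combined_scale(total_norm: float, loss_scale: float, max_norm: float) -> float:
        # total_norm is still inflated by loss_scale at this point
        clip = ((total_norm / loss_scale) + 1e-6) / max_norm
        return clip * loss_scale if clip > 1.0 else loss_scale

    # combined_scale(2048.0, 1024.0, 1.0) ~= 2048.0, so grad *= 1/2048 removes the
    # 1024x loss scale and shrinks the norm from 2.0 down to ~1.0 in one pass;
    # if the norm is already under max_norm, only the plain 1/loss_scale applies.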
+    def broadcast_params(self):
+        handles = []
+
+        for group_id in range(self.num_param_groups):
+            for rank in range(self._zero_world_size[group_id]):
+                # The following operations are performed only on the rank to which parameters are assigned.
+                if rank in self.param_group_no_params_ranks[group_id]:
+                    continue
+                fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
+                # grank = gpc.get_ranks_in_group(group_type)[rank]  # need to convert to the global rank
+                # assert grank == rank, f"{grank} == {rank}"
+                g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode[group_id])[rank]
+                handle = dist.broadcast(
+                    fp16_param,
+                    src=g_rank,
+                    group=gpc.get_group(self._broadcast_parallel_mode[group_id]),
+                    async_op=True,
+                )
+
+                if self._overlap_sync_param:
+                    self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
+                else:
+                    handles.append(handle)
+
+        for handle in handles:
+            handle.wait()
+
+        torch.cuda.synchronize()
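`broadcast_params` above issues every broadcast with async_op=True and waits
only once at the end (or hands the handles to the overlap handler), letting
the per-rank shard broadcasts pipeline on the interconnect. A minimal sketch
of that issue-all-then-wait pattern, assuming torch >= 2.0 for
`dist.get_process_group_ranks`:

    import torch.distributed as dist

    def broadcast_all_shards(shards, group):
        # shards: list indexed by group-local rank; rank i owns shards[i]
        global_ranks = dist.get_process_group_ranks(group)
        handles = [
            dist.broadcast(shard, src=global_ranks[rank], group=group, async_op=True)
            for rank, shard in enumerate(shards)
        ]
        for handle in handles:  # a single wait point instead of one wait per shard
            handle.wait()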
+    ##################
+    # FP16 Utilities #
+    ##################
+
+    def _check_overflow(self):
+        # clear previous overflow record
+        self._found_overflow.fill_(0.0)
+
+        # check for overflow
+        for group_id in range(len(self._fp16_param_groups)):
+            # The following operations are performed only on the rank to which parameters are assigned.
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                for avg_grad in self._grad_store.get_averaged_gradients_by_group(group_id):
+                    if avg_grad is not None and has_inf_or_nan(avg_grad):
+                        self._found_overflow.fill_(1.0)
+                        break
+        dist.all_reduce(
+            self._found_overflow,
+            op=dist.ReduceOp.MAX,
+            group=gpc.get_group(ParallelMode.GLOBAL),
+        )
+
+        return self._found_overflow.item() > 0
+
+    def _unscale_and_clip_grads(self, grad_groups_flat, total_norm_groups, loss_scale):
+        # compute the combined scale factor for each group
+        combined_scale_groups = []
+
+        if self._clip_grad_norm > 0.0:
+            # the norm passed in is in fact norm * loss_scale
+            for group_id, total_norm in enumerate(total_norm_groups):
+                combined_scale_groups.append(loss_scale)
+                clip = ((total_norm / loss_scale) + 1e-6) / self._clip_grad_norm
+                if clip > 1.0:
+                    combined_scale_groups[group_id] = clip * loss_scale
+
+        for group_id, grad in enumerate(grad_groups_flat):
+            grad.data.mul_(1.0 / combined_scale_groups[group_id])
+
+    def clip_grad_norm(self, model, max_norm):
+        # will be conducted in step()
+        pass
+
+    def state_dict(self):
+        states = {}
+        grad_scaler = self.grad_scaler.state_dict()
+        states["grad_scaler"] = grad_scaler
+        optim_states = self.optim.state_dict()
+        states["base_optim_states"] = optim_states
+
+        flat_fp32_weights = {}
+        for group_id, param in self._fp32_flat_param_groups_of_current_rank.items():
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                assert param.grad is None
+                flat_fp32_weights[group_id] = param
+        states["flat_fp32_weights"] = flat_fp32_weights
+        states["zero_devide_optim_plan"] = self.params_per_rank_id_dict
+
+        return states
+
+    def load_state_dict(self, states):
+        # TODO: Need to take into account the change in the number of DP.
+        assert "grad_scaler" in states, "grad_scaler state not found!"
+        grad_scaler = states["grad_scaler"]
+        self.grad_scaler.load_state_dict(grad_scaler)
+        optim_states = states["base_optim_states"]
+        self.optim.load_state_dict(optim_states)
+
+        # load the fp32 model weights.
+        flat_fp32_weights = states["flat_fp32_weights"]
+        assert set(flat_fp32_weights.keys()) == set(self._fp32_flat_param_groups_of_current_rank)
+        for group_id, param in flat_fp32_weights.items():
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                self_param = self._fp32_flat_param_groups_of_current_rank[group_id]
+                assert (
+                    self_param.shape == param.shape
+                ), f"The loaded parameter shape is inconsistent, {self_param.shape} != {param.shape}"
+                self_param.data.copy_(param.data)
+
+        # load the fp16 model weights.
+        for group_id in range(len(self._fp16_param_groups)):
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(
+                    rank=self._zero_local_rank[group_id], group_id=group_id
+                )
+                fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
+                fp16_param.data.copy_(fp32_param)
+
+        if "zero_devide_optim_plan" in states:
+            self.params_per_rank_id_dict = states["zero_devide_optim_plan"]
+
+
+def reload_zero_fp32_buff(optimizer):
+    # If we use an AMP optimizer, we need to update its fp32 buffer with the newly loaded weight values.
+    # Otherwise, we must ensure that model weights are loaded before zero is initialized.
+ if isinstance(optimizer, HybridZeroOptimizer): + for group_id, param_group in enumerate(optimizer.optim.param_groups): + if optimizer.param_group_has_params[group_id]: + # flatten fp16 params have already been updated by 'load_model_checkpoint' + fp16_flat_current_rank = optimizer._param_store.get_flat_fp16_param_by_rank_group( + optimizer._zero_local_rank[group_id], group_id + ) + # param_group["params"] is fp32 flatten optimizer states of this zero rank. + param_group["params"][0].data.copy_(fp16_flat_current_rank.float()) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 982a2466..b2ea0391 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -17,7 +17,7 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.utils.common import get_current_device, get_tensor_norm, move_norm_to_cuda from internlm.utils.logger import get_logger -from internlm.utils.parallel import is_model_parallel_parameter +from internlm.utils.parallel import is_model_parallel_parameter, is_weight_parallel_parameter logger = get_logger(__file__) @@ -243,6 +243,14 @@ def append_grad(g, p): and gpc.get_local_rank(ParallelMode.TENSOR) == 0 ): # if not used in each chunk, such as layernorm append_grad(g, p) + elif ( + gpc.is_initialized(ParallelMode.WEIGHT) + and not is_weight_parallel_parameter(p) + and gpc.get_local_rank(ParallelMode.WEIGHT) == 0 + ): # if not used in each chunk, such as layernorm + append_grad(g, p) + elif is_weight_parallel_parameter(p): + append_grad(g, p) elif is_model_parallel_parameter(p): append_grad(g, p) elif gpc.get_local_rank(ParallelMode.TENSOR) != 0: @@ -312,11 +320,11 @@ def compute_norm( total_norm = total_norm + previous_norm # Sum across all model-parallel GPUs. - if gpc.is_initialized(ParallelMode.MODEL): + if gpc.is_initialized(ParallelMode.WEIGHT): dist.all_reduce( total_norm, op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.MODEL), + group=gpc.get_group(ParallelMode.WEIGHT), ) # This is because we use zero1, so we need to use this reduction. diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index a05f62df..96548f71 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -108,14 +108,14 @@ def initialize_model(): # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. 
- set_mode(ParallelMode.DATA) + set_mode(ParallelMode.WEIGHT_DATA) # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) gpc.fstp_handler = None - if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.TENSOR)) + if gpc.config.parallel["weight"]["size"] >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.WEIGHT)) return model diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 6e5384f5..f6e72cff 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -4,7 +4,7 @@ import torch.distributed as dist from torch import nn -from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode +from internlm.core.context import IS_TENSOR_PARALLEL, IS_WEIGHT_PARALLEL, ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel @@ -13,13 +13,17 @@ def is_model_parallel_parameter(p): return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) +def is_weight_parallel_parameter(p): + return hasattr(p, IS_WEIGHT_PARALLEL) and getattr(p, IS_WEIGHT_PARALLEL) + + def sync_model_param(model): r"""Make sure data parameters are consistent during Data Parallel Mode. Args: model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. """ - if gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1: + if gpc.is_initialized(ParallelMode.WEIGHT_DATA) and gpc.get_world_size(ParallelMode.WEIGHT_DATA) > 1: sync_moe_param = ( gpc.is_initialized(ParallelMode.EXPERT_DATA) and gpc.get_world_size(ParallelMode.EXPERT_DATA) > 1 ) @@ -28,8 +32,8 @@ def sync_model_param(model): ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.EXPERT_DATA)) else: - ranks = gpc.get_ranks_in_group(ParallelMode.DATA) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.DATA)) + ranks = gpc.get_ranks_in_group(ParallelMode.WEIGHT_DATA) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.WEIGHT_DATA)) def sync_model_param_within_tp(model): diff --git a/train.py b/train.py index 789094ac..996d7465 100644 --- a/train.py +++ b/train.py @@ -220,7 +220,7 @@ def main(args): # start iterating the train data and begin training for batch_count in range(train_state.batch_count, total_steps): empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval) - torch.cuda.memory._record_memory_history() + # torch.cuda.memory._record_memory_history() start_time = time.time() timer("one-batch").start() @@ -327,7 +327,7 @@ def main(args): if batch_count % 2 == 0: prof.step() - torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() @@ -355,4 +355,4 @@ def main(args): alert_address=gpc.config.monitor.alert.feishu_alert_address, excp_info=traceback.format_exc() ) - torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") From cab9abd2589bd782af2ecf7b9e68967087ceb937 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 29 Nov 2023 
16:59:01 +0800 Subject: [PATCH 078/153] fix(training_internlm.py): fix loss accuracy(optim init and seed set) --- internlm/core/context/parallel_context.py | 6 ++++++ internlm/model/overlap_handler.py | 2 +- internlm/solver/optimizer/__init__.py | 1 + .../solver/optimizer/hybrid_zero_optim2.py | 2 +- internlm/train/training_internlm.py | 16 +++++++++++++-- internlm/utils/parallel.py | 20 +++++++++++++++++++ 6 files changed, 43 insertions(+), 4 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 8d34f608..e25b3de6 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -596,12 +596,18 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): tp_seed = seed + tp_rank + pipeline_offset * 1024 add_seed(ParallelMode.TENSOR, tp_seed) + if self.is_initialized(ParallelMode.WEIGHT): + wp_rank = self.get_local_rank(ParallelMode.WEIGHT) + wp_seed = seed + wp_rank + pipeline_offset * 1024 + add_seed(ParallelMode.WEIGHT, wp_seed) + # we do not set the random state mode to ParallelMode.DATA until model is built (instead, we use a dummy mode # during model construction), this is because the random state will be different in different tensor parallel # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform # additional random operations during the RowParallelLinear module building process. # set_mode(ParallelMode.DUMMY) set_mode(ParallelMode.TENSOR) + set_mode(ParallelMode.WEIGHT) seeds = get_seeds() seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()]) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index b2131a74..3d02e5d4 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -40,7 +40,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.head = [] self.embedding = [] self.model_checkpoint = gpc.config.model.checkpoint - self.enable_memory_pool = gpc.config.parallel["tensor"].get("memory_pool", False) + self.enable_memory_pool = gpc.config.parallel["weight"].get("memory_pool", False) self.is_forward = True self.reduce_scatter_handlers = {} diff --git a/internlm/solver/optimizer/__init__.py b/internlm/solver/optimizer/__init__.py index 7c6a1c64..309f2295 100644 --- a/internlm/solver/optimizer/__init__.py +++ b/internlm/solver/optimizer/__init__.py @@ -3,5 +3,6 @@ from .fsdp_optimizer import FSDPadaptOptimizer from .hybrid_zero_optim import HybridZeroOptimizer, reload_zero_fp32_buff +from .hybrid_zero_optim2 import HybridZeroOptimizer2 __all__ = ["FSDPadaptOptimizer", "HybridZeroOptimizer", "reload_zero_fp32_buff"] diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py index 7ab9823b..e2b3995a 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -131,7 +131,7 @@ def __init__( if self._overlap_sync_param: assert self._param_bcast_sync_handler is not None - if gpc.config.parallel.weight >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + if gpc.config.parallel["weight"]["size"] >= 1 and gpc.config.parallel["weight"]["overlap"] is True: self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 96548f71..f0421cd4 100644 --- a/internlm/train/training_internlm.py +++ 
b/internlm/train/training_internlm.py
@@ -48,7 +48,7 @@
 from internlm.monitor.monitor import monitor_manager as mm
 from internlm.solver.beta2_scheduler import Beta2Scheduler
 from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
-from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer
+from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer, HybridZeroOptimizer2
 from internlm.solver.optimizer.utils import ParamBcastSyncHandler
 from internlm.train.utils import create_param_groups
 from internlm.utils.common import DummyProfile
@@ -58,6 +58,7 @@
     set_model_params_layer_name,
     sync_model_param,
     sync_model_param_within_tp,
+    sync_model_param_within_wp,
 )
 from internlm.utils.registry import MODEL_INITIALIZER
 from internlm.utils.timeout import llm_timeout
@@ -106,6 +107,8 @@ def initialize_model():
     # the same across tensor parallelism.
     sync_model_param_within_tp(model)
 
+    sync_model_param_within_wp(model)
+
     # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random
     # state in the same dp group are all the same.
     set_mode(ParallelMode.WEIGHT_DATA)
@@ -182,7 +185,14 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
         eps=adam_cfg.adam_eps,
     )
 
-    if not gpc.config.parallel.zero1.fsdp:
+    if gpc.config.parallel.weight.size > 1:
+        optimizer = HybridZeroOptimizer2(
+            naive_optimizer,
+            grad_scal_cfg=gpc.config.grad_scaler,
+            zero_cfg=gpc.config.hybrid_zero_optimizer,
+            param_bcast_sync_handler=param_bcast_sync_handler,
+        )
+    elif not gpc.config.parallel.zero1.fsdp:
         optimizer = HybridZeroOptimizer(
             naive_optimizer,
             grad_scal_cfg=gpc.config.grad_scaler,
@@ -608,6 +618,8 @@ def record_current_batch_training_metrics(
                 tflops_list_2.append(tflops_2)
             if batch_count == gpc.config.data.total_steps - 1:
                 print(tgs_list, flush=True)
+                if len(tgs_list) <= 0:
+                    return
                 avg_tgs = sum(tgs_list) / len(tgs_list)
                 for tgs in tgs_list.copy():
                     if abs(tgs - avg_tgs) > 400:
diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py
index f6e72cff..3399491c 100644
--- a/internlm/utils/parallel.py
+++ b/internlm/utils/parallel.py
@@ -56,6 +56,26 @@ def sync_model_param_within_tp(model):
             dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode))
 
 
+def sync_model_param_within_wp(model):
+    r"""This function is adapted from colossalai's ``sync_model_param``.
+
+    We modified it to sync only the parameters that live within weight parallelism
+    but are not split by weight parallelism, making sure such parameters
+    (for example RMSNorm and LayerNorm weights) are the same across each
+    weight-parallel rank.
+
+    Args:
+        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
+ """ + parallel_mode = ParallelMode.WEIGHT + if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: + for param in model.parameters(): + if not is_weight_parallel_parameter(param): + ranks = gpc.get_ranks_in_group(parallel_mode) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) + + def get_parallel_log_file_name(): if gpc.is_rank_for_log(): fn_prefix = "main_" # Indicates a rank with more output information From d3ee3eff165e59b237740df005bee031a1315a05 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 30 Nov 2023 16:08:46 +0800 Subject: [PATCH 079/153] fix(model): reset embedding and head --- internlm/model/embedding.py | 54 +++-------------------------- internlm/model/linear.py | 23 ------------ internlm/model/modeling_internlm.py | 15 ++++---- internlm/model/overlap_handler.py | 27 +++------------ internlm/train/training_internlm.py | 28 +++++++++++++-- 5 files changed, 41 insertions(+), 106 deletions(-) diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py index ad6823b6..4be47d64 100644 --- a/internlm/model/embedding.py +++ b/internlm/model/embedding.py @@ -17,52 +17,6 @@ from .utils import gather_forward_split_backward, split_forward_gather_backward -# class Embedding1D(nn.Module): -# """ -# 1D Embedding. - -# Args: -# num_embeddings (int): The size of vocab. -# embedding_dim (int): The dimention of model. -# padding_idx (int): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient; -# therefore, the embedding vector at :attr:`padding_idx` is not updated during training, -# i.e. it remains as a fixed "pad". None by default. -# dtype (Optional[torch.dtype]): Data type None by default. - -# """ - -# def __init__( -# self, -# num_embeddings: int, -# embedding_dim: int, -# *args, -# padding_idx: int = None, -# dtype: torch.dtype = None, -# **kwargs, -# ): -# super().__init__() - -# self.num_embeddings = num_embeddings -# self.embed_dim = embedding_dim -# embed_dim_per_partition = embedding_dim // gpc.tensor_parallel_size - -# self.padding_idx = padding_idx -# self.embed_args = args -# self.embed_kwargs = kwargs - -# self.weight = nn.Parameter(torch.empty((num_embeddings, embed_dim_per_partition), dtype=dtype)) - -# def forward(self, input_: Tensor) -> Tensor: -# output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) - -# output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1) - -# if gpc.config.parallel.sequence_parallel: -# output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1) - -# return output - - class Embedding1D(nn.Module): """ 1D Embedding. 
@@ -99,10 +53,12 @@ def __init__( self.weight = nn.Parameter(torch.empty((num_embeddings, embed_dim_per_partition), dtype=dtype)) def forward(self, input_: Tensor) -> Tensor: - input_ = split_forward_gather_backward(input_, ParallelMode.SEQUENCE, dim=1) + output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) + + output = gather_forward_split_backward(output_parallel, ParallelMode.WEIGHT, dim=-1) - weight = gather_forward_split_backward(self.weight, ParallelMode.WEIGHT, dim=-1) - output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) + if gpc.config.parallel.sequence > 1: + output = split_forward_gather_backward(output, ParallelMode.SEQUENCE, dim=1) return output diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 0948ee9c..b92b2ee5 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -77,29 +77,6 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 ) -class FSTPScaleColumnParallelLinear(BaseScaleColumnParallelLinear): - """ - ScaleColumnParallelLinear in flash implementation. - """ - - def forward(self, input, gather_dim=0): # pylint: disable=W0622 - # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: - # we do an all_gather of x before doing the matmul. - # If not, then the input is already gathered. - if self.weight_scale != 1: - weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() - else: - weight = self.weight - return fstp_fused_dense_func( - input, - weight, - self.bias, - process_group=self.process_group, - module=self, - handler=gpc.fstp_handler, - ) - - class MegatronScaleColumnParallelLinear(BaseScaleColumnParallelLinear): """ ScaleColumnParallelLinear in megatron implementation. 
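The embedding.py hunk above switches Embedding1D from gathering the weight to
gathering the output: each rank embeds with its hidden-dim shard, the partial
outputs are concatenated along the last dim, and the result is re-split along
the sequence dim. A single-process sketch of the shapes involved, with made-up
sizes (vocab=8, hidden=4, 2 ranks) standing in for the distributed
gather/split primitives:

    import torch
    import torch.nn.functional as F

    vocab, hidden, ranks = 8, 4, 2
    weight_shards = [torch.randn(vocab, hidden // ranks) for _ in range(ranks)]
    tokens = torch.randint(0, vocab, (1, 6))  # (batch, seqlen)

    # per-rank lookup: (1, 6, hidden // ranks); in training this is an all_gather
    partials = [F.embedding(tokens, w) for w in weight_shards]
    full = torch.cat(partials, dim=-1)       # (1, 6, hidden) after the gather
    seq_shard = full.chunk(ranks, dim=1)[0]  # (1, 3, hidden) kept per rank after the split
    assert full.shape == (1, 6, hidden) and seq_shard.shape == (1, 3, hidden)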
diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 4cb20999..1797f677 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -17,7 +17,6 @@ MegatronScaleColumnParallelLinear, RewardModelLinear, ScaleColumnParallelLinear, - FSTPScaleColumnParallelLinear, get_mlp_cls, ) from internlm.model.multi_head_attention import MHA @@ -32,6 +31,7 @@ from internlm.utils.logger import get_logger from internlm.utils.registry import MODEL_INITIALIZER + MODEL_TYPE = "INTERNLM" logger = get_logger(__file__) @@ -316,12 +316,11 @@ def __init__( if is_reward: head_cls = RewardModelLinear else: - # head_cls = ( - # ScaleColumnParallelLinear - # if self.sp_mode in ["flash-attn", "none", "intern"] - # else MegatronScaleColumnParallelLinear - # ) - head_cls = FSTPScaleColumnParallelLinear + head_cls = ( + ScaleColumnParallelLinear + if self.sp_mode in ["flash-attn", "none", "intern"] + else MegatronScaleColumnParallelLinear + ) if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -442,8 +441,6 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N else: # Training hidden_states = self.head(hidden_states, gather_dim=0) - hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.SEQUENCE, dim=0) - if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) return hidden_states diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 3d02e5d4..086947f3 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -11,7 +11,7 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.core.scheduler import SchedulerHook from internlm.model.embedding import Embedding1D -from internlm.model.linear import FSTPLinear, FSTPScaleColumnParallelLinear +from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear from internlm.model.utils import ( all_gather_raw, all_gather_raw_bias_memory_pool, @@ -55,11 +55,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non _chunk = _chunk.model for _chunk_name, children in _chunk.named_children(): - if isinstance(children, FSTPScaleColumnParallelLinear): + if isinstance(children, ScaleColumnParallelLinear): setattr(children, "_fstp_name", "head") - setattr(children.weight, "_fstp_reduce_scatter_str", f"head.weight") - if children.bias is not None: - setattr(children.bias, "_fstp_reduce_scatter_str", f"head.bias") self.head.append(children) elif isinstance(children, Embedding1D): self.embedding.append(children) @@ -168,7 +165,7 @@ def _get_bias_from_memory_pool(self, module: nn.Module): return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] def get_weight_all_gather(self, module): - if self.enable_memory_pool and getattr(module, "_fstp_name") != "head": + if self.enable_memory_pool: return self._get_weight_from_memory_pool(module) else: return self.weight_global_output[module] @@ -205,7 +202,7 @@ def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True def _all_gather_module_weight(self, module): - if self.enable_memory_pool and getattr(module, "_fstp_name") != "head": + if self.enable_memory_pool: if module.bias is not None: bias_handle = all_gather_raw_bias_memory_pool( module.bias, @@ -323,16 +320,6 @@ def _post_backward_hook_for_module(module, grad_input, 
grad_output): # pylint: _clear_handle(module) _clear_weight(module) - def _pre_hook_for_head(module: nn.Module, inputs: Any): # pylint: disable=W0613 - if module not in self.weight_global_handle: - self._all_gather_module_weight(module) - - _wait_handle(module) - - def _post_hook_for_head(module, grad_input, grad_output): # pylint: disable=W0613 - _clear_handle(module) - _clear_weight(module) - # register forward hooks # 1. register post_forward_hook @embedding module to prefetch for block 0 # 2. register pre_forward_hook @out_proj module to prefetch for next block, @@ -368,12 +355,6 @@ def _post_hook_for_head(module, grad_input, grad_output): # pylint: disable=W06 module.register_full_backward_pre_hook(_pre_backward_hook_for_module) module.register_full_backward_hook(_post_backward_hook_for_module) - for head in self.head: - head.register_forward_pre_hook(_pre_hook_for_head) - head.register_full_backward_pre_hook(_pre_hook_for_head) - head.register_forward_hook(_post_hook_for_head) - head.register_full_backward_hook(_post_hook_for_head) - class FSTPOverlapSchedulerHook(SchedulerHook): """ diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index f0421cd4..9658baec 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -390,6 +390,30 @@ def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: Trai return batch, train_iter +# def initialize_llm_profile(profiling: bool = False, start_time: str = None): +# """Initialize and return the profiler context manager instance.""" + +# if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0: +# llm_profile = torch.profiler.profile +# logger.info(f"Do profiling in rank {gpc.get_global_rank()}!") +# else: +# llm_profile = DummyProfile + +# return llm_profile( +# activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], +# schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1), +# on_trace_ready=torch.profiler.tensorboard_trace_handler( +# f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" +# + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" +# + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_" +# + f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}", +# ), +# with_stack=True, +# with_modules=True, +# profile_memory=True, +# ) + + def initialize_llm_profile(profiling: bool = False, start_time: str = None): """Initialize and return the profiler context manager instance.""" @@ -405,8 +429,8 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): on_trace_ready=torch.profiler.tensorboard_trace_handler( f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" - + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_" - + f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}", + + f"wp{gpc.get_local_rank(ParallelMode.WEIGHT)}_" + + f"sp{gpc.get_local_rank(ParallelMode.SEQUENCE)}", ), with_stack=True, with_modules=True, From 6cd271c3bc6625ea119533911857878d2af18436 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 1 Dec 2023 10:52:01 +0800 Subject: [PATCH 080/153] fix(model): fix process group error --- internlm/model/embedding.py | 8 ++++++-- internlm/model/loss.py | 2 +- internlm/model/modeling_internlm.py | 20 ++++++++++++++++++- internlm/model/multi_head_attention.py | 9 +++++++++ .../solver/optimizer/hybrid_zero_optim2.py | 13 
+++++++++--- internlm/train/training_internlm.py | 1 + sort_log.py | 17 ++++++++++++++++ 7 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 sort_log.py diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py index 4be47d64..225a5f16 100644 --- a/internlm/model/embedding.py +++ b/internlm/model/embedding.py @@ -44,7 +44,7 @@ def __init__( self.num_embeddings = num_embeddings self.embed_dim = embedding_dim - embed_dim_per_partition = embedding_dim // gpc.weight_parallel_size + embed_dim_per_partition = embedding_dim // gpc.sequence_parallel_size self.padding_idx = padding_idx self.embed_args = args @@ -55,10 +55,14 @@ def __init__( def forward(self, input_: Tensor) -> Tensor: output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) - output = gather_forward_split_backward(output_parallel, ParallelMode.WEIGHT, dim=-1) + output = gather_forward_split_backward(output_parallel, ParallelMode.SEQUENCE, dim=-1) if gpc.config.parallel.sequence > 1: output = split_forward_gather_backward(output, ParallelMode.SEQUENCE, dim=1) + # print( + # f"ht debug embed: rank:{gpc.get_global_rank()} output.shape:{output.shape} output:{output}", + # flush=True, + # ) return output diff --git a/internlm/model/loss.py b/internlm/model/loss.py index ac92b4b9..a634d2c7 100644 --- a/internlm/model/loss.py +++ b/internlm/model/loss.py @@ -28,7 +28,7 @@ def __init__(self, parallel_output=True, label_smoothing=0): self.loss_fn = FlashCrossEntropyLoss( reduction="mean", inplace_backward=True, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(ParallelMode.SEQUENCE), label_smoothing=label_smoothing, ) # The loss in this place is bound to the gather_output initialized by VocabParallelClassifier1D else: diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 1797f677..a937a3f4 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -149,9 +149,13 @@ def __init__( for param in self.norm1.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) + if gpc.config.parallel.weight.size > 1: + setattr(param, IS_SEQUENCE_PARALLEL, True) for param in self.norm2.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) + if gpc.config.parallel.weight.size > 1: + setattr(param, IS_SEQUENCE_PARALLEL, True) self.dropout2 = nn.Dropout(drop_rate) self.use_swiglu = use_swiglu @@ -240,7 +244,14 @@ def _dropout_and_norm_ffn(_residual, _hidden_states): if self.residual_in_fp32: residual = residual.to(torch.float32) + # print( + # f"ht debug mlp rank:{gpc.get_global_rank()} input.shape:{hidden_states.shape} input:{hidden_states}", + # flush=True, + # ) hidden_states = self.mlp(hidden_states) + # print( + # f"ht debug mlp rank:{gpc.get_global_rank()} out.shape:{hidden_states.shape} out:{hidden_states}", flush=True + # ) return hidden_states + residual @@ -376,7 +387,7 @@ def __init__( self.head = head_cls( in_features=hidden_size, out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size, - process_group=gpc.get_group(ParallelMode.WEIGHT), + process_group=gpc.get_group(ParallelMode.SEQUENCE), bias=False, device=device, dtype=dtype, @@ -391,6 +402,8 @@ def __init__( for param in self.norm.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) + if gpc.config.parallel.weight.size > 1: + 
setattr(param, IS_SEQUENCE_PARALLEL, True) self.parallel_output = parallel_output @@ -441,6 +454,11 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N else: # Training hidden_states = self.head(hidden_states, gather_dim=0) + # print( + # f"ht debug head rank:{gpc.get_global_rank()} hidden_states.shape:{hidden_states.shape} hidden_states:{hidden_states}", + # flush=True, + # ) + if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) return hidden_states diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index d06cd967..77b05c6f 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -445,9 +445,11 @@ def _packed_forward(self, x, inference_params=None, **kwargs): split x during sequence parallel, we split the batch * seqlen dimension (in case batch is small). """ + # print(f"ht debug mha rank:{gpc.get_global_rank()} wqkv.shape:{self.Wqkv.weight.shape} wqkv:{self.Wqkv.weight}") qkv = self.Wqkv(x) # total x hsz' qkv = rearrange(qkv, "t (three h d) -> t three h d", three=3, d=self.head_dim) # total x 3 x n_head x d qkv = self.rotary_emb(qkv, **kwargs) + # print(f"ht debug mha rank:{gpc.get_global_rank()} qkv.shape:{qkv.shape} qkv:{qkv}", flush=True) kwargs.pop("indexes") if inference_params is None: if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn: @@ -462,5 +464,12 @@ def _packed_forward(self, x, inference_params=None, **kwargs): raise RuntimeError("Not support this right now") context = rearrange(context, "b h d -> b (h d)") # recover the shape + # print(f"ht debug mha rank:{gpc.get_global_rank()} context.shape:{context.shape} context:{context}") + # print( + # f"ht debug mha rank:{gpc.get_global_rank()} out_proj.shape:{self.out_proj.weight.shape} out_proj:{self.out_proj.weight}" + # ) out = self.out_proj(context) + + # print(f"ht debug mha rank:{gpc.get_global_rank()} out.shape:{out.shape} out:{out}") + return out diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py index e2b3995a..fd34b265 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -131,7 +131,7 @@ def __init__( if self._overlap_sync_param: assert self._param_bcast_sync_handler is not None - if gpc.config.parallel["weight"]["size"] >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None @@ -314,7 +314,7 @@ def reduction_sp_func(): param.grad, dtype=None, dst_rank=reduce_rank, - parallel_mode=ParallelMode.GLOBAL, + parallel_mode=ParallelMode.WEIGHT, ) handle.wait() @@ -341,8 +341,15 @@ def reduce_grad_hook_sp(*args): # pylint: disable=W0613 # if sequence_parallel is True, # the grad of norm should be all-reduce across the tp process group + # if ( + # gpc.config.parallel.sequence_parallel is True + # and hasattr(param, IS_SEQUENCE_PARALLEL) + # and getattr(param, IS_SEQUENCE_PARALLEL) is True + # ): + # accum_grad_obj.register_hook(reduce_grad_hook_sp) + if ( - gpc.config.parallel.sequence_parallel is True + gpc.config.parallel.weight.size > 1 and hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True ): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 
9658baec..d48d99c3 100644
--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@@ -192,6 +192,7 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
             zero_cfg=gpc.config.hybrid_zero_optimizer,
             param_bcast_sync_handler=param_bcast_sync_handler,
         )
+        logger.info("use HybridZeroOptimizer2 for new partition strategy...")
     elif not gpc.config.parallel.zero1.fsdp:
         optimizer = HybridZeroOptimizer(
             naive_optimizer,
diff --git a/sort_log.py b/sort_log.py
new file mode 100644
index 00000000..786c2282
--- /dev/null
+++ b/sort_log.py
@@ -0,0 +1,17 @@
+import re
+
+# read the log file
+with open("ht.log", "r") as file:
+    log_content = file.read()
+
+# use a regex to extract log blocks that start with "ht debug" and end at the "device=..." field
+log_blocks = re.findall(r"ht debug.*?device=[^\n]*", log_content, re.DOTALL)
+
+# sort the log blocks in ascending order by the integer after "rank:"
+sorted_log_blocks = sorted(log_blocks, key=lambda x: int(re.search(r"rank:(\d+)", x).group(1)))
+
+# write the sorted log blocks to a new file
+with open("sorted.log", "w") as file:
+    file.write("\n\n".join(sorted_log_blocks))
+
+print("Log blocks have been sorted in ascending order by the integer after 'rank:' and saved to sorted.log.")
From 0817b8cf204aef1c0f0b3ec74cbc59acbf3f93a6 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Fri, 1 Dec 2023 15:18:49 +0800
Subject: [PATCH 081/153] fix(model): fix FSTP linear Torch process group

---
 internlm/model/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internlm/model/utils.py b/internlm/model/utils.py
index 89980c07..04fa0efe 100644
--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@@ -657,7 +657,7 @@ def backward(ctx, grad_output, *args):
         batch_dim = batch_shape.numel()
         grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
 
-        world_size = gpc.get_world_size(ParallelMode.TENSOR)
+        world_size = gpc.get_world_size(ParallelMode.WEIGHT)
         if world_size > 1:
             if overlap_handler is not None:
                 total_weight = gpc.fstp_handler.get_weight_all_gather(module=module)
From 1b7d2dc455ef2438ba084e97cc0b9f4ec658c965 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Thu, 7 Dec 2023 10:25:32 +0800
Subject: [PATCH 082/153] fix(overlap_handler.py): release module post backward
 when model ckpt is

---
 internlm/core/naive_amp.py        | 2 +-
 internlm/model/overlap_handler.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/internlm/core/naive_amp.py b/internlm/core/naive_amp.py
index 9bead52f..fb04759b 100644
--- a/internlm/core/naive_amp.py
+++ b/internlm/core/naive_amp.py
@@ -81,7 +81,7 @@ def _convert_to_fp16(self, input_: Any):
 
     def _convert_to_fp32(self, input_: Any):
         """Converts the input to fp32 if it is a Tensor of dtype float16."""
-        if isinstance(input_, Tensor) and input_.dtype == torch.float16:
+        if isinstance(input_, Tensor) and input_.dtype in (torch.float16, torch.bfloat16):
             input_ = input_.float()
         return input_
 
diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py
index 086947f3..c81b09d0 100644
--- a/internlm/model/overlap_handler.py
+++ b/internlm/model/overlap_handler.py
@@ -353,7 +353,9 @@ def _post_backward_hook_for_module(module, grad_input, grad_output):  # pylint:
 
         for module in self.fstp_modules:
             module.register_full_backward_pre_hook(_pre_backward_hook_for_module)
-            module.register_full_backward_hook(_post_backward_hook_for_module)
+
+        for module in self.fstp_modules:
+            module.register_full_backward_hook(_post_backward_hook_for_module)
 
From fd5a144724a19b00636178edc2792bc30b6d7ef7 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com> Date: Mon, 11 Dec 2023 17:17:35 +0800 Subject: [PATCH 083/153] feat(model): embedding and head use sp group and refactor parameter group --- internlm/core/context/__init__.py | 4 +- internlm/core/context/parallel_context.py | 4 +- internlm/model/modeling_internlm.py | 58 +++++++++------ internlm/model/multi_head_attention.py | 12 +-- .../solver/optimizer/hybrid_zero_optim2.py | 74 +++++++++++++------ internlm/solver/optimizer/store.py | 4 +- internlm/solver/optimizer/utils.py | 45 +++++++---- internlm/train/training_internlm.py | 4 +- internlm/train/utils.py | 67 ++++++++++++++++- internlm/utils/parallel.py | 39 ++++++---- 10 files changed, 224 insertions(+), 87 deletions(-) diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index e17b4ba3..f62d6a90 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -1,7 +1,9 @@ from .parallel_context import ( IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, - IS_WEIGHT_PARALLEL, + IS_REPLICA_ZERO_PARALLEL, + IS_SEQUENCE_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, Config, ParallelContext, global_context, diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index e25b3de6..c2fc574d 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -26,7 +26,9 @@ IS_TENSOR_PARALLEL = "is_tensor_parallel" IS_SEQUENCE_PARALLEL = "is_sequence_parallel" -IS_WEIGHT_PARALLEL = "is_weight_parallel" +IS_REPLICA_ZERO_PARALLEL = "is_replica_zero_parallel" +IS_SEQUENCE_DATA_PARALLEL = "is_sequence_data_parallel" +IS_WEIGHT_ZERO_PARALLEL = "is_weight_zero_parallel" logger = get_logger(__file__) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index a937a3f4..400ad273 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -9,7 +9,14 @@ from flash_attn.modules.mlp import ParallelFusedMLP from torch import nn -from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, IS_WEIGHT_PARALLEL, ParallelMode +from internlm.core.context import ( + IS_SEQUENCE_PARALLEL, + IS_TENSOR_PARALLEL, + IS_REPLICA_ZERO_PARALLEL, + IS_SEQUENCE_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) from internlm.core.context.parallel_context import global_context as gpc from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D @@ -142,20 +149,22 @@ def __init__( dtype=dtype, ) for _, param in self.mlp.named_parameters(): - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) + # if gpc.get_world_size(ParallelMode.TENSOR) > 1: + # setattr(param, IS_TENSOR_PARALLEL, True) if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - setattr(param, IS_WEIGHT_PARALLEL, True) + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) for param in self.norm1.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - if gpc.config.parallel.weight.size > 1: - setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.sequence_parallel is True: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.weight.size > 1: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) for param in self.norm2.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - if 
gpc.config.parallel.weight.size > 1: - setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.sequence_parallel is True: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.weight.size > 1: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) self.dropout2 = nn.Dropout(drop_rate) self.use_swiglu = use_swiglu @@ -348,10 +357,10 @@ def __init__( ) for _, param in self.embedding.named_parameters(): normal_(std=0.0052)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - setattr(param, IS_WEIGHT_PARALLEL, True) + # if gpc.get_world_size(ParallelMode.TENSOR) > 1: + # setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.SEQUENCE) > 1: + setattr(param, IS_SEQUENCE_DATA_PARALLEL, True) self.embed_grad_scale = embed_grad_scale self.blocks = nn.ModuleList( [ @@ -395,15 +404,16 @@ def __init__( ) for _, param in self.head.named_parameters(): normal_(std=0.0052)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - setattr(param, IS_WEIGHT_PARALLEL, True) + # if gpc.get_world_size(ParallelMode.TENSOR) > 1: + # setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.SEQUENCE) > 1: + setattr(param, IS_SEQUENCE_DATA_PARALLEL, True) for param in self.norm.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - if gpc.config.parallel.weight.size > 1: - setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.sequence_parallel is True: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.weight.size > 1: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) self.parallel_output = parallel_output diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 77b05c6f..5d9e0a40 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -10,7 +10,7 @@ import torch.nn.functional as F from einops import rearrange -from internlm.core.context import IS_WEIGHT_PARALLEL +from internlm.core.context import IS_WEIGHT_ZERO_PARALLEL try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func @@ -235,14 +235,14 @@ def __init__( **factory_kwargs, ) # need to assign tp attribute so that internlm know it is tensor parallel module - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - for name in ["out_proj", "Wqkv"]: - for param in getattr(self, name).parameters(): - setattr(param, IS_TENSOR_PARALLEL, True) + # if gpc.get_world_size(ParallelMode.TENSOR) > 1: + # for name in ["out_proj", "Wqkv"]: + # for param in getattr(self, name).parameters(): + # setattr(param, IS_TENSOR_PARALLEL, True) if gpc.get_world_size(ParallelMode.WEIGHT) > 1: for name in ["out_proj", "Wqkv"]: for param in getattr(self, name).parameters(): - setattr(param, IS_WEIGHT_PARALLEL, True) + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) def forward(self, x, seqlen=None, inference_params=None, **kwargs): if kwargs.get("indexes", None) is not None: diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py index fd34b265..3bbf2678 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -9,8 +9,9 @@ import 
torch.distributed as dist from torch.optim import Optimizer -from internlm.core.context import IS_SEQUENCE_PARALLEL, Config, ParallelMode +from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_REPLICA_ZERO_PARALLEL, Config, ParallelMode from internlm.core.context import global_context as gpc +from internlm.core.context.parallel_context import IS_SEQUENCE_DATA_PARALLEL from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, @@ -150,17 +151,24 @@ def __init__( # to find real zero mode. if zero is not used, set all param group as ParallelMode.ZERO1 # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode - zero_mode = ( - ParallelMode.ZERO1 - if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA - else ParallelMode.EXPERT_DATA - ) + # zero_mode = ( + # ParallelMode.ZERO1 + # if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA + # else ParallelMode.EXPERT_DATA + # ) + zero_mode = param_group["optimizer_mode"] + self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) self._zero_world_size.append(gpc.get_world_size(zero_mode)) # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name self._broadcast_parallel_mode.append(zero_mode) - self._bucket_store.append(BucketStore(group_id, ParallelMode.WEIGHT_DATA)) - self._accum_grad_buckets.append(BucketStore(group_id, ParallelMode.WEIGHT_DATA)) + + grad_reduce_mode = ParallelMode.WEIGHT_DATA + if param_group["name"] == "embed_head": + grad_reduce_mode = ParallelMode.DATA + + self._bucket_store.append(BucketStore(group_id, grad_reduce_mode)) + self._accum_grad_buckets.append(BucketStore(group_id, grad_reduce_mode)) # assign parameters to ranks the params in the list are sorted params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group) @@ -184,15 +192,26 @@ def __init__( param.data = param.data.cpu() # flatten the reordered tensors - for rank in range(self._zero_world_size[group_id]): - # No flat fp16 buffer is allocated if the process has no parameters. - if rank not in self.param_group_no_params_ranks[group_id]: - tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) - with torch.no_grad(): - flat_tensor = flatten(tensor_list) - flat_tensor = flat_tensor.data.cuda() - self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) - sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + if param_group["name"] == "embed_head": + tensor_list = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) + with torch.no_grad(): + flat_tensor = flatten(tensor_list) + flat_tensor = flat_tensor.data.cuda() + sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + # for rank in range(self._zero_world_size[group_id]): + self._param_store.add_flat_fp16_param_by_rank_group( + self._zero_local_rank[group_id], group_id, flat_tensor + ) + else: + for rank in range(self._zero_world_size[group_id]): + # No flat fp16 buffer is allocated if the process has no parameters. 
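The flatten/sync step above packs the fp16 parameters a rank owns into one contiguous buffer and re-points each original tensor into that buffer, so later reduce/broadcast calls operate on a single flat tensor. A minimal, self-contained sketch of the pattern using torch's internal flatten helpers (the name flatten_and_sync is illustrative, not the repo's):

    import torch
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

    def flatten_and_sync(tensor_list):
        # pack into one contiguous buffer, then alias each original tensor to
        # its slice of that buffer (this is what sync_param accomplishes above)
        flat = _flatten_dense_tensors(tensor_list)
        for old, new in zip(tensor_list, _unflatten_dense_tensors(flat, tensor_list)):
            old.data = new.data
        return flat

    params = [torch.randn(4, 4).half() for _ in range(3)]
    flat_fp16 = flatten_and_sync(params)
    assert params[0].data_ptr() == flat_fp16.data_ptr()  # first param aliases the buffer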
+ if rank not in self.param_group_no_params_ranks[group_id]: + tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + with torch.no_grad(): + flat_tensor = flatten(tensor_list) + flat_tensor = flat_tensor.data.cuda() + self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) + sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) # create a copy of fp32 weights of the parameters for which this rank is responsible # No flat fp32 buffer is allocated if the process has no parameters. @@ -222,8 +241,6 @@ def __init__( # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled. self.skip_grad_reduce = False - # reduction hook is only used if overlapping communication - # if it is stage 1 without overlapping, no hook will be attached self._attach_reduction_hook() @property @@ -244,6 +261,10 @@ def num_param_groups(self): def _partition_param_list(self, group_id, param_group): no_params_ranks = [] + if param_group["name"] == "embed_head": + params_per_rank = [param_group["params"] for _ in range(self._zero_world_size[group_id])] + return params_per_rank, set(no_params_ranks) + params_per_rank = [[] for _ in range(self._zero_world_size[group_id])] numel_per_rank = [0 for _ in range(self._zero_world_size[group_id])] self.params_per_rank_id_dict.append([[] for _ in range(self._zero_world_size[group_id])]) @@ -350,8 +371,8 @@ def reduce_grad_hook_sp(*args): # pylint: disable=W0613 if ( gpc.config.parallel.weight.size > 1 - and hasattr(param, IS_SEQUENCE_PARALLEL) - and getattr(param, IS_SEQUENCE_PARALLEL) is True + and hasattr(param, IS_REPLICA_ZERO_PARALLEL) + and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True ): accum_grad_obj.register_hook(reduce_grad_hook_sp) @@ -382,9 +403,9 @@ def belongs_to_current_rank(self, param) -> bool: :return: True if the parameter should be updated by the current rank. Otherwise false. :rtype: bool """ - tensor_rank = self._param_store.get_param_rank(param) + tensor_ranks = self._param_store.get_param_rank(param) group_id = getattr(param, "group_id") - return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) + return gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) in tensor_ranks def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None: for _param in bucket.get_param(reduce_rank): @@ -654,6 +675,11 @@ def step(self, closure=None): """ assert closure is None, "closure is not supported by step()" + # import pdb + + # if gpc.get_global_rank() == 0: + # pdb.set_trace() + # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_sync_grad: @@ -859,6 +885,8 @@ def broadcast_params(self): handles = [] for group_id in range(self.num_param_groups): + if self.param_groups[group_id]["name"] == "embed_head": + continue for rank in range(self._zero_world_size[group_id]): # The following operations are performed only on the rank to which parameters are assigned. 
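The continue for the "embed_head" group above is the counterpart of its replicated partition: every rank in that group keeps and updates the full parameter list, so there is nothing to broadcast after step(). A sketch of the resulting broadcast loop, assuming an initialized torch.distributed process group; flat_shards[g][r] stands in for the flat fp16 tensor of group g owned by zero-rank r, and global_src maps a zero-rank to its global rank:

    import torch.distributed as dist

    def broadcast_updated_params(param_groups, flat_shards, zero_group, global_src):
        handles = []
        for g, pgroup in enumerate(param_groups):
            if pgroup["name"] == "embed_head":
                continue  # replicated group: all ranks already hold the updated params
            for r, shard in enumerate(flat_shards[g]):
                # each shard owner pushes its freshly updated fp16 shard
                handles.append(
                    dist.broadcast(shard, src=global_src(r), group=zero_group, async_op=True)
                )
        for handle in handles:
            handle.wait()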
if rank in self.param_group_no_params_ranks[group_id]: diff --git a/internlm/solver/optimizer/store.py b/internlm/solver/optimizer/store.py index f486ccec..c42f1a56 100644 --- a/internlm/solver/optimizer/store.py +++ b/internlm/solver/optimizer/store.py @@ -177,8 +177,10 @@ def set_param_to_rank(self, tensor: Tensor, rank: int) -> None: :param rank: The rank of which the process is responsible for updating the parameter :type rank: int """ + if tensor not in self._fp16_param_to_rank: + self._fp16_param_to_rank[tensor] = [] - self._fp16_param_to_rank[tensor] = rank + self._fp16_param_to_rank[tensor].append(rank) def get_param_rank(self, tensor: Tensor) -> int: """ diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index b2ea0391..223fddf1 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -14,10 +14,20 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc +from internlm.core.context.parallel_context import ( + IS_REPLICA_ZERO_PARALLEL, + IS_SEQUENCE_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, +) from internlm.core.naive_amp import NaiveAMPModel from internlm.utils.common import get_current_device, get_tensor_norm, move_norm_to_cuda from internlm.utils.logger import get_logger -from internlm.utils.parallel import is_model_parallel_parameter, is_weight_parallel_parameter +from internlm.utils.parallel import ( + is_model_parallel_parameter, + is_replica_zero_parallel_parameter, + is_sequence_data_parallel_parameter, + is_weight_zero_parallel_parameter, +) logger = get_logger(__file__) @@ -244,12 +254,14 @@ def append_grad(g, p): ): # if not used in each chunk, such as layernorm append_grad(g, p) elif ( - gpc.is_initialized(ParallelMode.WEIGHT) - and not is_weight_parallel_parameter(p) - and gpc.get_local_rank(ParallelMode.WEIGHT) == 0 - ): # if not used in each chunk, such as layernorm + is_replica_zero_parallel_parameter(p) and gpc.get_global_rank(ParallelMode.GLOBAL) == 0 + ): # if not used in each chunk, such as layernorm IS_REPLICA_ZERO_PARALLEL parameter group + append_grad(g, p) + elif gpc.is_initialized(ParallelMode.SEQUENCE) and is_sequence_data_parallel_parameter(p): + # process all ranks for IS_SEQUENCE_DATA_PARALLEL parameter group append_grad(g, p) - elif is_weight_parallel_parameter(p): + elif gpc.is_initialized(ParallelMode.WEIGHT) and is_weight_zero_parallel_parameter(p): + # process all ranks for IS_WEIGHT_ZERO_PARALLEL parameter group append_grad(g, p) elif is_model_parallel_parameter(p): append_grad(g, p) @@ -320,16 +332,23 @@ def compute_norm( total_norm = total_norm + previous_norm # Sum across all model-parallel GPUs. - if gpc.is_initialized(ParallelMode.WEIGHT): - dist.all_reduce( - total_norm, - op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.WEIGHT), - ) + if hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL): + if gpc.is_initialized(ParallelMode.WEIGHT): + dist.all_reduce( + total_norm, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.WEIGHT), + ) # This is because we use zero1, so we need to use this reduction. # TODO: Check zero group to be a subset of dp group. 
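The store.py change earlier in this patch turns parameter ownership into a one-to-many relation: with the replicated "embed_head" group, the same parameter is updated by every rank of its zero group, so set_param_to_rank appends and the ownership test becomes a membership check. A self-contained sketch (the dict is keyed by parameter identity):

    import torch

    param_to_ranks = {}  # param -> list of owning zero-ranks

    def set_param_to_rank(param, rank):
        param_to_ranks.setdefault(param, []).append(rank)

    def belongs_to_current_rank(param, local_rank):
        return local_rank in param_to_ranks[param]

    p = torch.nn.Parameter(torch.zeros(2))
    for r in range(4):  # replicated case: every zero-rank owns the parameter
        set_param_to_rank(p, r)
    assert belongs_to_current_rank(p, 3)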
- dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) + if (hasattr(parameters[0], IS_REPLICA_ZERO_PARALLEL) and getattr(parameters[0], IS_REPLICA_ZERO_PARALLEL)) or ( + hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) + ): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) + + if hasattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL) and getattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.SEQUENCE)) if torch.is_tensor(total_norm): total_norm = total_norm.item() diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index d48d99c3..8d786489 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -58,7 +58,7 @@ set_model_params_layer_name, sync_model_param, sync_model_param_within_tp, - sync_model_param_within_wp, + sync_model_replica_param_group, ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout @@ -107,7 +107,7 @@ def initialize_model(): # the same across tensor parallelism. sync_model_param_within_tp(model) - sync_model_param_within_wp(model) + sync_model_replica_param_group(model) # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 9096a2a4..0e866eb5 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -2,7 +2,7 @@ import torch -from internlm.core.context.parallel_context import ParallelMode +from internlm.core.context.parallel_context import IS_SEQUENCE_DATA_PARALLEL, ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param @@ -81,6 +81,69 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) return tuple(param_groups) +def split_params_into_different_groups_for_optimizer_with_new_partition_strategy( + param_groups: Tuple[Dict], +) -> Tuple[Dict]: + """Split parameters into different groups for optimizer + + Args: + param_groups (Tuple[Dict]): The list of parameter groups to split + Input Example: + >>> ( + >>> {'name': 'default', 'params': [tensor], 'weight_decay' :xxx}, + >>> ) + + Returns: + Tuple[Dict]: list of params groups for optimizer + Output Example: + >>> ( + >>> {'name': 'default','params': [tensor],'weight_decay' :xxx}, + >>> {'name': 'embed_head', 'params': [tensor],'weight_decay' :xxx}, + >>> ) + """ + + if isinstance(param_groups, tuple): + param_groups = list(param_groups) # Tuple cannot be modified + elif isinstance(param_groups, dict): + param_groups = [param_groups] + elif not isinstance(param_groups, list): + raise ValueError(f"Unknown param group type of {type(param_groups)}") + + # print(f"ht debug params_groups before split total len:{len(param_groups[0]['params'])}", flush=True) + + # create new groups for IS_SEQUENCE_DATA_PARALLEL parameter group + new_groups = {} + new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + + for pgroup in param_groups: + # copy attribute from origin group, we assume the input param_groups only + # have one group, so the attribute will not be copyed multiple times. 
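A compact, self-contained sketch of the splitting rule this new function implements: parameters tagged IS_SEQUENCE_DATA_PARALLEL (the embedding and head) move to a new "embed_head" group whose optimizer state is managed over the data-parallel group, while everything else stays in the default group driven by ZeRO-1. The attribute name matches this series; the mode values here are plain string stand-ins for the ParallelMode enums:

    IS_SEQUENCE_DATA_PARALLEL = "is_sequence_data_parallel"

    def split_param_groups(pgroup):
        # copy shared attributes (weight_decay etc.) into the new group
        embed_head = {k: v for k, v in pgroup.items() if k not in ("name", "params")}
        embed_head.update(name="embed_head", params=[], optimizer_mode="data")
        default = dict(pgroup, params=[], optimizer_mode="zero1")
        for p in pgroup["params"]:
            sp_data = getattr(p, IS_SEQUENCE_DATA_PARALLEL, False)
            (embed_head if sp_data else default)["params"].append(p)
        return default, embed_head

With this split in place, the optimizer reads optimizer_mode per group instead of deriving a single zero mode for the whole model.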
+ for ori_key in pgroup.keys(): + if ori_key not in ("name", "params"): + for _, group in new_groups.items(): + group[ori_key] = pgroup[ori_key] + # assign param + origin_params = [] + for param in pgroup["params"]: + if hasattr(param, IS_SEQUENCE_DATA_PARALLEL) and getattr(param, IS_SEQUENCE_DATA_PARALLEL) is True: + new_groups["embed_head"]["params"].append(param) + else: + origin_params.append(param) + + # default param group, which is the first group in the param groups + pgroup["params"] = origin_params + pgroup["optimizer_mode"] = ParallelMode.ZERO1 + + # param groups may contain empty groups, such as fp32 + param_groups.extend(new_groups.values()) + + # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) + # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) + + return tuple(param_groups) + + def create_param_groups(model, weight_decay): parameters = {"params": list(model.parameters()), "name": "default", "weight_decay": weight_decay} - return split_params_into_different_groups_for_optimizer(parameters) + # return split_params_into_different_groups_for_optimizer(parameters) + return split_params_into_different_groups_for_optimizer_with_new_partition_strategy(parameters) diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 3399491c..2b421b07 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -4,7 +4,13 @@ import torch.distributed as dist from torch import nn -from internlm.core.context import IS_TENSOR_PARALLEL, IS_WEIGHT_PARALLEL, ParallelMode +from internlm.core.context import ( + IS_TENSOR_PARALLEL, + IS_REPLICA_ZERO_PARALLEL, + IS_SEQUENCE_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel @@ -13,8 +19,16 @@ def is_model_parallel_parameter(p): return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) -def is_weight_parallel_parameter(p): - return hasattr(p, IS_WEIGHT_PARALLEL) and getattr(p, IS_WEIGHT_PARALLEL) +def is_replica_zero_parallel_parameter(p): + return hasattr(p, IS_REPLICA_ZERO_PARALLEL) and getattr(p, IS_REPLICA_ZERO_PARALLEL) + + +def is_sequence_data_parallel_parameter(p): + return hasattr(p, IS_SEQUENCE_DATA_PARALLEL) and getattr(p, IS_SEQUENCE_DATA_PARALLEL) + + +def is_weight_zero_parallel_parameter(p): + return hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) def sync_model_param(model): @@ -56,24 +70,21 @@ def sync_model_param_within_tp(model): dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) -def sync_model_param_within_wp(model): +def sync_model_replica_param_group(model): r"""This function is changed from colossalai, which is ``sync_model_param``. - We modified this function to make sure it only sync parameters within tensor parallelism - but they are not splitted by tensor parallelism. - This function is used to make sure parameters that are not splitted by tensor parallelism - are the same across each tensor parallelism. + We modified this function to make sure it only sync IS_REPLICA_ZERO_PARALLEL parameters in world size. + This function is used to make sure parameters that are not splitted are the same across each rank. For example, parameters like RMSNorm, LayerNorm... Args: model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. 
""" - parallel_mode = ParallelMode.WEIGHT - if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: - for param in model.parameters(): - if not is_weight_parallel_parameter(param): - ranks = gpc.get_ranks_in_group(parallel_mode) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) + + for param in model.parameters(): + if is_replica_zero_parallel_parameter(param): + ranks = gpc.get_ranks_in_group(ParallelMode.GLOBAL) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.GLOBAL)) def get_parallel_log_file_name(): From ac72710a68c05d2c1f23af44f1d9147dedab034e Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 12 Dec 2023 11:30:08 +0800 Subject: [PATCH 084/153] feat(model): modify grad norm compute func --- .../solver/optimizer/hybrid_zero_optim2.py | 50 +++++++++---------- internlm/solver/optimizer/utils.py | 27 ++++++---- internlm/train/utils.py | 6 ++- internlm/utils/parallel.py | 12 ++++- 4 files changed, 56 insertions(+), 39 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py index 3bbf2678..fbfa20cd 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -192,26 +192,26 @@ def __init__( param.data = param.data.cpu() # flatten the reordered tensors - if param_group["name"] == "embed_head": - tensor_list = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) - with torch.no_grad(): - flat_tensor = flatten(tensor_list) - flat_tensor = flat_tensor.data.cuda() - sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) - # for rank in range(self._zero_world_size[group_id]): - self._param_store.add_flat_fp16_param_by_rank_group( - self._zero_local_rank[group_id], group_id, flat_tensor - ) - else: - for rank in range(self._zero_world_size[group_id]): - # No flat fp16 buffer is allocated if the process has no parameters. - if rank not in self.param_group_no_params_ranks[group_id]: - tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) - with torch.no_grad(): - flat_tensor = flatten(tensor_list) - flat_tensor = flat_tensor.data.cuda() - self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) - sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + # if param_group["name"] == "embed_head": + # tensor_list = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) + # with torch.no_grad(): + # flat_tensor = flatten(tensor_list) + # flat_tensor = flat_tensor.data.cuda() + # sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + # # for rank in range(self._zero_world_size[group_id]): + # self._param_store.add_flat_fp16_param_by_rank_group( + # self._zero_local_rank[group_id], group_id, flat_tensor + # ) + # else: + for rank in range(self._zero_world_size[group_id]): + # No flat fp16 buffer is allocated if the process has no parameters. 
+ if rank not in self.param_group_no_params_ranks[group_id]: + tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + with torch.no_grad(): + flat_tensor = flatten(tensor_list) + flat_tensor = flat_tensor.data.cuda() + self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) + sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) # create a copy of fp32 weights of the parameters for which this rank is responsible # No flat fp32 buffer is allocated if the process has no parameters. @@ -261,9 +261,9 @@ def num_param_groups(self): def _partition_param_list(self, group_id, param_group): no_params_ranks = [] - if param_group["name"] == "embed_head": - params_per_rank = [param_group["params"] for _ in range(self._zero_world_size[group_id])] - return params_per_rank, set(no_params_ranks) + # if param_group["name"] == "embed_head": + # params_per_rank = [param_group["params"] for _ in range(self._zero_world_size[group_id])] + # return params_per_rank, set(no_params_ranks) params_per_rank = [[] for _ in range(self._zero_world_size[group_id])] numel_per_rank = [0 for _ in range(self._zero_world_size[group_id])] @@ -885,8 +885,8 @@ def broadcast_params(self): handles = [] for group_id in range(self.num_param_groups): - if self.param_groups[group_id]["name"] == "embed_head": - continue + # if self.param_groups[group_id]["name"] == "embed_head": + # continue for rank in range(self._zero_world_size[group_id]): # The following operations are performed only on the rank to which parameters are assigned. if rank in self.param_group_no_params_ranks[group_id]: diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 223fddf1..7e760b85 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -254,13 +254,13 @@ def append_grad(g, p): ): # if not used in each chunk, such as layernorm append_grad(g, p) elif ( - is_replica_zero_parallel_parameter(p) and gpc.get_global_rank(ParallelMode.GLOBAL) == 0 + is_replica_zero_parallel_parameter(p) and gpc.get_local_rank(ParallelMode.WEIGHT) == 0 ): # if not used in each chunk, such as layernorm IS_REPLICA_ZERO_PARALLEL parameter group append_grad(g, p) - elif gpc.is_initialized(ParallelMode.SEQUENCE) and is_sequence_data_parallel_parameter(p): + elif is_sequence_data_parallel_parameter(p): # process all ranks for IS_SEQUENCE_DATA_PARALLEL parameter group append_grad(g, p) - elif gpc.is_initialized(ParallelMode.WEIGHT) and is_weight_zero_parallel_parameter(p): + elif is_weight_zero_parallel_parameter(p): # process all ranks for IS_WEIGHT_ZERO_PARALLEL parameter group append_grad(g, p) elif is_model_parallel_parameter(p): @@ -332,23 +332,28 @@ def compute_norm( total_norm = total_norm + previous_norm # Sum across all model-parallel GPUs. - if hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL): + if hasattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL) and getattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.SEQUENCE)) + else: if gpc.is_initialized(ParallelMode.WEIGHT): dist.all_reduce( total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.WEIGHT), ) + if gpc.is_initialized(ParallelMode.PIPELINE): + dist.all_reduce( + total_norm, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.PIPELINE), + ) # This is because we use zero1, so we need to use this reduction. 
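The reduction order above is the crux of the grad-norm fix: each rank computes a partial sum of squared gradient norms over its local shards, that sum is all-reduced over every group along which the parameters are split or replicated (the sequence group for embed/head parameters, otherwise the weight and pipeline groups followed by the zero group), and only then is the root taken. A sketch of the aggregation, assuming an initialized process group and caller-supplied group handles:

    import torch
    import torch.distributed as dist

    def aggregate_grad_norm(partial_sq_sum, groups, norm_type=2.0):
        # partial_sq_sum: this rank's sum of |g|**norm_type over its local shards
        total = torch.tensor(float(partial_sq_sum), device="cuda")
        for group in groups:  # e.g. [weight_group, pipeline_group, zero_group]
            dist.all_reduce(total, op=dist.ReduceOp.SUM, group=group)
        return total.item() ** (1.0 / norm_type)

Because the groups are orthogonal, the sequential reductions compose into one global sum over all shards of the parameters being normed.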
# TODO: Check zero group to be a subset of dp group. - if (hasattr(parameters[0], IS_REPLICA_ZERO_PARALLEL) and getattr(parameters[0], IS_REPLICA_ZERO_PARALLEL)) or ( - hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) - ): - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) - - if hasattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL) and getattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL): - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.SEQUENCE)) + # if (hasattr(parameters[0], IS_REPLICA_ZERO_PARALLEL) and getattr(parameters[0], IS_REPLICA_ZERO_PARALLEL)) or ( + # hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) + # ): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) if torch.is_tensor(total_norm): total_norm = total_norm.item() diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 0e866eb5..382c46d5 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -2,7 +2,7 @@ import torch -from internlm.core.context.parallel_context import IS_SEQUENCE_DATA_PARALLEL, ParallelMode +from internlm.core.context.parallel_context import IS_REPLICA_ZERO_PARALLEL, IS_SEQUENCE_DATA_PARALLEL, ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param @@ -114,6 +114,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # create new groups for IS_SEQUENCE_DATA_PARALLEL parameter group new_groups = {} new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} for pgroup in param_groups: # copy attribute from origin group, we assume the input param_groups only @@ -127,6 +128,8 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy for param in pgroup["params"]: if hasattr(param, IS_SEQUENCE_DATA_PARALLEL) and getattr(param, IS_SEQUENCE_DATA_PARALLEL) is True: new_groups["embed_head"]["params"].append(param) + # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: + # new_groups["layer_norm"]["params"].append(param) else: origin_params.append(param) @@ -139,6 +142,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) + # print(f"ht debug params_groups after split layer_norm len:{len(param_groups[2]['params'])}", flush=True) return tuple(param_groups) diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 2b421b07..966332a1 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -24,11 +24,19 @@ def is_replica_zero_parallel_parameter(p): def is_sequence_data_parallel_parameter(p): - return hasattr(p, IS_SEQUENCE_DATA_PARALLEL) and getattr(p, IS_SEQUENCE_DATA_PARALLEL) + return ( + gpc.is_initialized(ParallelMode.SEQUENCE) + and hasattr(p, IS_SEQUENCE_DATA_PARALLEL) + and getattr(p, IS_SEQUENCE_DATA_PARALLEL) + ) def is_weight_zero_parallel_parameter(p): - return hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) + return ( + 
gpc.is_initialized(ParallelMode.WEIGHT) + and hasattr(p, IS_WEIGHT_ZERO_PARALLEL) + and getattr(p, IS_WEIGHT_ZERO_PARALLEL) + ) def sync_model_param(model): From 76be8c26534f1cf18e3670f76d3f7053d3037fd8 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 14 Dec 2023 14:53:40 +0800 Subject: [PATCH 085/153] fix(model/utils.py): fix fstp linear reduce scatter sum->avg --- internlm/model/utils.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 04fa0efe..a4fe3378 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -162,19 +162,23 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): return grad_weight, grad_bias -def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): +def reduce_scatter_raw( + input_: Tensor, process_group: ProcessGroup, op=torch.distributed.ReduceOp.SUM, async_op: bool = False +): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 output = torch.empty( input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device ).contiguous() handle = torch.distributed.reduce_scatter_tensor( - output, input_.contiguous(), group=process_group, async_op=async_op + output, input_.contiguous(), op=op, group=process_group, async_op=async_op ) return output, handle -def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): +def reduce_scatter_raw_memory_pool( + input_: Tensor, process_group: ProcessGroup, op=torch.distributed.ReduceOp.SUM, async_op: bool = False +): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 if gpc.fstp_handler.enable_memory_pool: @@ -185,7 +189,7 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device ).contiguous() handle = torch.distributed.reduce_scatter_tensor( - output, input_.contiguous(), group=process_group, async_op=async_op + output, input_.contiguous(), op=op, group=process_group, async_op=async_op ) return output, handle @@ -575,7 +579,7 @@ def backward(ctx, grad_output, *args): if world_size > 1: if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( - grad_weight, process_group, async_op=True + grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True ) assert hasattr(weight, "_fstp_reduce_scatter_str") overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( @@ -592,7 +596,7 @@ def backward(ctx, grad_output, *args): ) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( - grad_bias, process_group, async_op=True + grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True ) assert hasattr(bias, "_fstp_reduce_scatter_str") overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( @@ -608,9 +612,13 @@ def backward(ctx, grad_output, *args): device=grad_bias.device, ) else: - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + grad_weight, handle_grad_weight = reduce_scatter_raw( + grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True + ) if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, 
async_op=True) + grad_bias, handle_grad_bias = reduce_scatter_raw( + grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True + ) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -676,7 +684,7 @@ def backward(ctx, grad_output, *args): if world_size > 1: if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( - grad_weight, process_group, async_op=True + grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True ) assert hasattr(weight, "_fstp_reduce_scatter_str") overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( @@ -693,7 +701,7 @@ def backward(ctx, grad_output, *args): ) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( - grad_bias, process_group, async_op=True + grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True ) assert hasattr(bias, "_fstp_reduce_scatter_str") overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( @@ -709,9 +717,13 @@ def backward(ctx, grad_output, *args): device=grad_bias.device, ) else: - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + grad_weight, handle_grad_weight = reduce_scatter_raw( + grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True + ) if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + grad_bias, handle_grad_bias = reduce_scatter_raw( + grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True + ) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None From d30aecddbc0401e37eacd3f5346ba6456275948f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 19 Dec 2023 16:36:10 +0800 Subject: [PATCH 086/153] feat(core/context): support pp for initializing isp/msp/fsp process group --- configs/7B_sft.py | 20 +- internlm/core/context/__init__.py | 2 - internlm/core/context/parallel_context.py | 31 +- .../core/context/process_group_initializer.py | 479 ++++++++++-------- internlm/initialize/launch.py | 35 +- train.py | 28 + 6 files changed, 348 insertions(+), 247 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 3c491660..822bcb52 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -152,21 +152,25 @@ 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. tensor parallel (dict): 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. + 2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'], + defaults to 'mtp', means the pure megatron tensor parallel without sequence parallel. + msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size. + fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size. + isp: customed intern sequence parallel without tensor parallel, can be used with weight parallel. pipeline parallel (dict): 1. size: int, the size of pipeline parallel. 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, defaults to False. +weight parallel (dict): + 1. size: int, the size of weight parallel. + 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. memory_pool: bool, enable/disable memory pool, defaults to False. """ parallel = dict( zero1=dict(size=2, fsdp=False), - tensor=dict(size=1, sp="intern", intern_overlap=False, memory_pool=False), - pipeline=dict(size=1, interleaved_overlap=True), - weight=dict(size=8, overlap=True, memory_pool=True), - sequence=4, + tensor=dict(size=4, mode="mtp"), + pipeline=dict(size=2, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ) cudnn_deterministic = False diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index f62d6a90..5382837e 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -10,7 +10,6 @@ ) from .process_group_initializer import ( Initializer_Data, - Initializer_Model, Initializer_Nettest, Initializer_Pipeline, Initializer_Tensor, @@ -44,7 +43,6 @@ "Initializer_Nettest", "Initializer_Zero3_dp", "ProcessGroupInitializer", - "Initializer_Model", "seed", "set_mode", "add_seed", diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index c2fc574d..538d3947 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -476,18 +476,24 @@ def init_parallel_groups(self): parallel_config = self.config.get("parallel", None) if parallel_config is not None: self._set_parallel_size_from_config(parallel_config, "weight", "weight_parallel_size") - self._set_parallel_size_from_config(parallel_config, "sequence", "sequence_parallel_size") - self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size") self._set_parallel_size_from_config(parallel_config, "tensor", "tensor_parallel_size") + self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size") self._set_parallel_size_from_config(parallel_config, "zero1", "zero1_parallel_size") # the user should not set the data parallel size manually # instead, it should be calculated based on other parallel config - assert self.tensor_parallel_size == 1 - assert self.pipeline_parallel_size == 1 assert self.zero1_parallel_size >= 1 - self.data_parallel_size = self.world_size // self.sequence_parallel_size - self.weight_data_parallel_size = self.world_size // self.weight_parallel_size + self.sequence_parallel_size = self.tensor_parallel_size + self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size + self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size + if parallel_config["tensor"]["mode"] != "isp": + assert ( + self.zero1_parallel_size <= self.data_parallel_size + ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" + else: + assert ( + self.zero1_parallel_size <= self.weight_data_parallel_size + ), f"zero1_size:{self.zero1_parallel_size} should be less than wdp_size:{self.weight_data_parallel_size}" # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 @@ -508,6 +514,7 @@ def init_parallel_groups(self): rank, world_size, self.weight_parallel_size, + self.weight_data_parallel_size, self.sequence_parallel_size, self.data_parallel_size, 
self.pipeline_parallel_size, @@ -520,12 +527,16 @@ def init_parallel_groups(self): # run initialization of different process groups initializers = [] initializers.append(pgroup_initializer.Initializer_Weight(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Sequence(*initializer_args)) + if parallel_config["tensor"]["mode"] == "isp": + initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Model(*initializer_args)) + # if self.weight_parallel_size <= 1: + # initializers.append(pgroup_initializer.Initializer_Model(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) + if parallel_config["tensor"]["mode"] != "isp": + initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) + else: + initializers.append(pgroup_initializer.Initializer_Zero1_ISP(*initializer_args)) if isinstance(self.config.parallel.zero1, dict) and self.config.parallel.zero1.get("fsdp", False): initializers.append(pgroup_initializer.Initializer_Zero3_dp(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args)) diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index ee81ac58..5e59df22 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -67,10 +67,14 @@ class ProcessGroupInitializer(ABC): Args: rank (int): The rank of current process. world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. + sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ @@ -79,6 +83,7 @@ def __init__( rank: int, world_size: int, weight_parallel_size: int, + weight_data_parallel_size: int, sequence_parallel_size: int, data_parallel_size: int, pipeline_parallel_size: int, @@ -90,6 +95,7 @@ def __init__( self.rank = rank self.world_size = world_size self.weight_parallel_size = weight_parallel_size + self.weight_data_parallel_size = weight_data_parallel_size self.sequence_parallel_size = sequence_parallel_size self.data_parallel_size = data_parallel_size self.pipeline_parallel_size = pipeline_parallel_size @@ -97,6 +103,8 @@ def __init__( self.zero1_parallel_size = zero1_parallel_size self.nettest_parallel_size = nettest_parallel_size self.expert_parallel_size = expert_parallel_size + + assert sequence_parallel_size == tensor_parallel_size super().__init__() @abstractmethod @@ -104,41 +112,50 @@ def init_dist_group(self, use_cpu: bool = False): pass -# class Initializer_Data(ProcessGroupInitializer): -# """A ProcessGroupInitializer for data parallelism. 
+# class Initializer_Model(ProcessGroupInitializer): +# """A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel +# groups). # Args: # rank (int): The rank of current process. # world_size (int): Size of whole communication world. +# weight_parallel_size (int): Size of model weight parallel. +# weight_data_parallel_size (int): Size of data parallel for common weight. +# sequence_parallel_size (int): Size of data sequence parallel. # data_parallel_size (int): Size of data parallel. # pipeline_parallel_size (int): Size of pipeline parallel. # tensor_parallel_size (int): Size of tensor parallel. # zero1_parallel_size (int): Size of zero1 parallel. +# nettest_parallel_size (int): Size of net testing parallel. # expert_parallel_size (int): Size of expert parallel. # """ # def __init__(self, *args, **kwargs): # super().__init__(*args, **kwargs) -# self.rank_num_per_dp_group = self.world_size // self.data_parallel_size -# assert self.world_size % self.data_parallel_size == 0 +# # only for msp or fsp +# assert self.weight_parallel_size == 1 +# self.rank_num_per_group = self.tensor_parallel_size * self.pipeline_parallel_size +# self.num_group = self.world_size // self.rank_num_per_group + +# assert self.world_size % self.rank_num_per_group == 0 # def init_dist_group(self, use_cpu: bool = False): -# """Initialize data parallel groups, and assign local_ranks and groups to each gpu. +# """Initialize model parallel groups, and assign local_ranks and groups to each gpu. # Returns: # Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): -# A Data parallelism's information tuple. +# A Model parallelism's information tuple. # """ # local_rank = None # ranks_in_group = None # process_group = None # cpu_group = None # group_world_size = None -# mode = ParallelMode.DATA +# mode = ParallelMode.MODEL -# for i in range(self.rank_num_per_dp_group): -# ranks = [i + j * self.rank_num_per_dp_group for j in range(self.data_parallel_size)] +# for i in range(self.num_group): +# ranks = [i * self.rank_num_per_group + j for j in range(self.rank_num_per_group)] # group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) # if use_cpu: # group_cpu = ( @@ -159,138 +176,92 @@ def init_dist_group(self, use_cpu: bool = False): # return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode -class Initializer_Model(ProcessGroupInitializer): - """A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel - groups). +class Initializer_Pipeline(ProcessGroupInitializer): + """A ProcessGroupInitializer for pipeline parallelism. Args: rank (int): The rank of current process. world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. + sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. 
""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.rank_num_per_group = self.tensor_parallel_size * self.pipeline_parallel_size - self.num_group = self.world_size // self.rank_num_per_group + self.num_pp_group = self.world_size // self.pipeline_parallel_size - assert self.world_size % self.rank_num_per_group == 0 + assert self.world_size % self.pipeline_parallel_size == 0 def init_dist_group(self, use_cpu: bool = False): - """Initialize model parallel groups, and assign local_ranks and groups to each gpu. + """Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu. Returns: - Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): - A Model parallelism's information tuple. + List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]: + A Pipeline parallelism's information in list of tuples. + + n=16 tp/sp=4 pp=2 dp=2 wp=8 + wp grops: [0-7] [8-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] + + n=16 tp/sp=4 pp=2 dp=2 wp=2 + wp grops: [0-1] [2-3] [4-5] [6-7] [8-9] [10-11] [12-13] [14-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] """ local_rank = None ranks_in_group = None process_group = None cpu_group = None group_world_size = None - mode = ParallelMode.MODEL + mode = ParallelMode.PIPELINE - for i in range(self.num_group): - ranks = [i * self.rank_num_per_group + j for j in range(self.rank_num_per_group)] - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + for i in range(self.num_pp_group): + ranks = [i + j * self.num_pp_group for j in range(self.pipeline_parallel_size)] + pipe_group_size = len(ranks) + pipe_group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) if use_cpu: group_cpu = ( dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) if dist.get_backend() != "gloo" - else group + else pipe_group ) else: group_cpu = None if self.rank in ranks: local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group + group_world_size = pipe_group_size + process_group = pipe_group cpu_group = group_cpu ranks_in_group = ranks return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode -class Initializer_Pipeline(ProcessGroupInitializer): - """A ProcessGroupInitializer for pipeline parallelism. - - Args: - rank (int): The rank of current process - world_size (int): Size of whole communication world - data_parallel_size (int): Size of data parallel - pipeline_parallel_size (int): Size of pipeline parallel - tensor_parallel_size (int): Size of tensor parallel - zero1_parallel_size (int): Size of zero1 parallel. - expert_parallel_size (int): Size of expert parallel. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.rank_num_per_dp_group = self.world_size // self.data_parallel_size - self.pipeline_stage_size = self.rank_num_per_dp_group // self.pipeline_parallel_size - - assert self.world_size % self.data_parallel_size == 0 - assert self.rank_num_per_dp_group % self.pipeline_parallel_size == 0 - - def init_dist_group(self, use_cpu: bool = False): - """Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu. - - Returns: - List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]: - A Pipeline parallelism's information in list of tuples. 
- """ - local_rank = None - ranks_in_group = None - process_group = None - cpu_group = None - group_world_size = None - mode = ParallelMode.PIPELINE - - for i in range(self.data_parallel_size): - for j in range(self.pipeline_stage_size): - ranks = list( - range( - i * self.rank_num_per_dp_group + j, - (i + 1) * self.rank_num_per_dp_group, - self.pipeline_stage_size, - ) - ) - pipe_group_size = len(ranks) - pipe_group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else pipe_group - ) - else: - group_cpu = None - - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = pipe_group_size - process_group = pipe_group - cpu_group = group_cpu - ranks_in_group = ranks - - return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode - - class Initializer_Tensor(ProcessGroupInitializer): """A ProcessGroupInitializer for tensor parallelism. Args: rank (int): The rank of current process. world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. + sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ @@ -343,21 +314,106 @@ class Initializer_Zero1(ProcessGroupInitializer): rank (int): The rank of current process. world_size (int): Size of whole communication world. weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. - zero1_parallel_size (int): Size of zero-1 parallel. + zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. + expert_parallel_size (int): Size of expert parallel. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.tensor_zero1_size = self.tensor_parallel_size * self.zero1_parallel_size + self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size + self.num_tensor_zero1_parallel_group = self.ranks_num_per_pp // self.tensor_zero1_size + + assert self.world_size % (self.tensor_parallel_size * self.zero1_parallel_size) == 0 + assert self.world_size % self.pipeline_parallel_size == 0 + + def init_dist_group(self, use_cpu: bool = False): + """Initialize zero1 parallel groups, and assign local_ranks and groups to each gpu. + + Returns: + Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): + A zero1 parallelism's information tuple. 
+ + n=16 tp/sp=4 pp=2 dp=2 zero1=2 + tp/sp grops: [0-3] [4-7] [8-11] [12-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] + zero1 groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + + n=16 tp/sp=2 pp=2 dp=4 zero1=2 + tp/sp grops: [0-1] [2-3] [4-5] [6-7] [8-9] [10-11] [12-13] [14-15] + data groups: [0,2,4,6] [1,3,5,7] + [8,10,12,14] [9,11,13,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] + zero1 groups: [0,2] [1,3] [4,6] [5,7] + [8,10] [9,11] [12,14] [13,15] + """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.ZERO1 + + for i in range(self.pipeline_parallel_size): + for j in range(self.num_tensor_zero1_parallel_group): + for k in range(self.tensor_parallel_size): + ranks = [ + i * self.ranks_num_per_pp + j * self.tensor_zero1_size + k + m * self.tensor_parallel_size + for m in range(self.zero1_parallel_size) + ] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode + + +class Initializer_Zero1_ISP(ProcessGroupInitializer): + """A ProcessGroupInitializer for zero-1 parallelism. + + Args: + rank (int): The rank of current process. + world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. + sequence_parallel_size (int): Size of data sequence parallel. + data_parallel_size (int): Size of data parallel. + pipeline_parallel_size (int): Size of pipeline parallel. + tensor_parallel_size (int): Size of tensor parallel. + zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.num_zero1_parallel_group = self.world_size // self.zero1_parallel_size self.weight_zero1_size = self.weight_parallel_size * self.zero1_parallel_size - self.num_weight_zero1_parallel_group = self.world_size // self.weight_zero1_size + self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size + self.num_weight_zero1_parallel_group = self.ranks_num_per_pp // self.weight_zero1_size - assert self.world_size % self.zero1_parallel_size == 0 + assert self.world_size % (self.pipeline_parallel_size * self.zero1_parallel_size) == 0 assert self.world_size % self.weight_zero1_size == 0 def init_dist_group(self, use_cpu: bool = False): @@ -371,6 +427,23 @@ def init_dist_group(self, use_cpu: bool = False): wp grops: [0-7] [8-15] [16-23] [24-31] zo1 groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] [16,24] [17,25] [18,26] [19,27] [20,28] [21,29] [22,30] [23,31] + + n=16 tp/sp=4 pp=2 dp=2 wp=8 wdp=1 zero1=1 + wp grops: [0-7] [8-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + wdp groups: [...] 
+ + n=16 tp/sp=4 pp=2 dp=2 wp=2 wdp=4 zero1=2 + wp grops: [0-1] [2-3] [4-5] [6-7] [8-9] [10-11] [12-13] [14-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] + wdp groups: [0,2,4,6] [1,3,5,7] + [8,10,12,14] [9,11,13,15] + zero1 groups: [0,2] [1,3] [4,6] [5,7] + [8,10] [9,11] [12,14] [13,15] + zero1=4: [0,2,4,6] [1,3,5,7] [8,10,12,14] [9,11,13,15] """ local_rank = None ranks_in_group = None @@ -379,28 +452,29 @@ def init_dist_group(self, use_cpu: bool = False): group_world_size = None mode = ParallelMode.ZERO1 - for i in range(self.num_weight_zero1_parallel_group): - for j in range(self.weight_parallel_size): - ranks = [ - i * self.weight_zero1_size + j + k * self.weight_parallel_size - for k in range(self.zero1_parallel_size) - ] - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else group - ) - else: - group_cpu = None - - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group - cpu_group = group_cpu - ranks_in_group = ranks + for i in range(self.pipeline_parallel_size): + for j in range(self.num_weight_zero1_parallel_group): + for k in range(self.weight_parallel_size): + ranks = [ + i * self.ranks_num_per_pp + j * self.weight_zero1_size + k + m * self.weight_parallel_size + for m in range(self.zero1_parallel_size) + ] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode @@ -688,11 +762,13 @@ class Initializer_Weight(ProcessGroupInitializer): rank (int): The rank of current process. world_size (int): Size of whole communication world. weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ @@ -738,63 +814,6 @@ def init_dist_group(self, use_cpu: bool = False): return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode -class Initializer_Sequence(ProcessGroupInitializer): - """A ProcessGroupInitializer for data sequence parallelism. - - Args: - rank (int): The rank of current process. - world_size (int): Size of whole communication world. - weight_parallel_size (int): Size of model weight parallel. - sequence_parallel_size (int): Size of data sequence parallel. - data_parallel_size (int): Size of data parallel. - pipeline_parallel_size (int): Size of pipeline parallel. - tensor_parallel_size (int): Size of tensor parallel. - zero1_parallel_size (int): Size of zero1 parallel. - expert_parallel_size (int): Size of expert parallel. 
- """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.num_sequence_parallel_group = self.world_size // self.sequence_parallel_size - - assert self.world_size % self.sequence_parallel_size == 0 - - def init_dist_group(self, use_cpu: bool = False): - """Initialize data sequence parallel groups, and assign local_ranks and groups to each gpu. - - Returns: - Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): - A Sequence parallelism's information tuple. - """ - local_rank = None - ranks_in_group = None - process_group = None - cpu_group = None - group_world_size = None - mode = ParallelMode.SEQUENCE - - for i in range(self.num_sequence_parallel_group): - ranks = [i * self.sequence_parallel_size + j for j in range(self.sequence_parallel_size)] - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else group - ) - else: - group_cpu = None - - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group - cpu_group = group_cpu - ranks_in_group = ranks - - return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode - - class Initializer_Data(ProcessGroupInitializer): """A ProcessGroupInitializer for data parallelism. @@ -802,20 +821,24 @@ class Initializer_Data(ProcessGroupInitializer): rank (int): The rank of current process. world_size (int): Size of whole communication world. weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.num_dp_group = self.sequence_parallel_size + self.num_dp_group = self.pipeline_parallel_size * self.sequence_parallel_size + self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size assert self.world_size % self.data_parallel_size == 0 assert self.world_size % self.sequence_parallel_size == 0 + assert self.world_size % self.pipeline_parallel_size == 0 def init_dist_group(self, use_cpu: bool = False): """Initialize data parallel groups, and assign local_ranks and groups to each gpu. @@ -824,9 +847,10 @@ def init_dist_group(self, use_cpu: bool = False): Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): A Data parallelism's information tuple. 
-        n=32 wp=8 sp=4 zo1=2
-        wp grops: [0-7] [8-15] [16-23] [24-31]
-        data groups: [0,4,8,12,16,20,24,28] [1,5,9,13,17,21,25,29] [2,6,10,14,18,22,26,30] [3,7,11,15,19,23,27,31]
+        n=16 tp/sp=4 pp=2 dp=2 wp=8
+        wp groups: [0-7] [8-15]
+        data groups: [0,4] [1,5] [2,6] [3,7]
+                     [8,12] [9,13] [10,14] [11,15]
         """
         local_rank = None
         ranks_in_group = None
@@ -835,24 +859,28 @@ def init_dist_group(self, use_cpu: bool = False):
         group_world_size = None
         mode = ParallelMode.DATA

-        for i in range(self.num_dp_group):
-            ranks = [i + j * self.sequence_parallel_size for j in range(self.data_parallel_size)]
-            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
-            if use_cpu:
-                group_cpu = (
-                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
-                    if dist.get_backend() != "gloo"
-                    else group
-                )
-            else:
-                group_cpu = None
+        for i in range(self.pipeline_parallel_size):
+            for j in range(self.sequence_parallel_size):
+                ranks = [
+                    i * self.ranks_num_per_pp + j + k * self.sequence_parallel_size
+                    for k in range(self.data_parallel_size)
+                ]
+                group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
+                if use_cpu:
+                    group_cpu = (
+                        dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                        if dist.get_backend() != "gloo"
+                        else group
+                    )
+                else:
+                    group_cpu = None

-            if self.rank in ranks:
-                local_rank = ranks.index(self.rank)
-                group_world_size = len(ranks)
-                process_group = group
-                cpu_group = group_cpu
-                ranks_in_group = ranks
+                if self.rank in ranks:
+                    local_rank = ranks.index(self.rank)
+                    group_world_size = len(ranks)
+                    process_group = group
+                    cpu_group = group_cpu
+                    ranks_in_group = ranks

         return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode

@@ -864,20 +892,23 @@ class Initializer_Weight_Data(ProcessGroupInitializer):
         rank (int): The rank of current process.
         world_size (int): Size of whole communication world.
         weight_parallel_size (int): Size of model weight parallel.
+        weight_data_parallel_size (int): Size of data parallel for common weight.
         sequence_parallel_size (int): Size of data sequence parallel.
         data_parallel_size (int): Size of data parallel.
         pipeline_parallel_size (int): Size of pipeline parallel.
         tensor_parallel_size (int): Size of tensor parallel.
         zero1_parallel_size (int): Size of zero1 parallel.
+        nettest_parallel_size (int): Size of net testing parallel.
         expert_parallel_size (int): Size of expert parallel.
     """

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.num_weight_dp_group = self.weight_parallel_size
-        self.weight_data_parallel_size = self.world_size // self.num_weight_dp_group
+        self.num_wdp_group_per_pp = self.world_size // self.pipeline_parallel_size // self.weight_data_parallel_size
+        self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size

-        assert self.world_size % self.weight_parallel_size == 0
+        assert self.world_size % self.pipeline_parallel_size == 0
+        assert self.world_size % (self.pipeline_parallel_size * self.weight_data_parallel_size) == 0

     def init_dist_group(self, use_cpu: bool = False):
         """Initialize weight's data parallel groups, and assign local_ranks and groups to each gpu.

         Returns:
             Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
                 A WEIGHT_DATA parallelism's information tuple.
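+
+        As a sketch, each wdp group below is enumerated by the loops that
+        follow, where i indexes the pp stage and j the offset inside it:
+
+            ranks = [
+                i * ranks_num_per_pp + j + k * weight_parallel_size
+                for k in range(weight_data_parallel_size)
+            ]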
-        n=32 wp=8 sp=4 zo1=2
+        n=32 wp=8 sp=4 zo1=2 with no pp
         wp grops: [0-7] [8-15] [16-23] [24-31]
         weight data groups: [0,8,16,24] [1,9,17,25] [2,10,18,26] [3,11,19,27]
                             [4,12,20,28] [5,13,21,29] [6,14,22,30] [7,15,23,31]
+
+        n=16 tp/sp=4 pp=2 dp=2 wp=8 wdp=1
+        wp groups: [0-7] [8-15]
+        data groups: [0,4] [1,5] [2,6] [3,7]
+                     [8,12] [9,13] [10,14] [11,15]
+        wdp groups: [...]
+
+        n=16 tp/sp=4 pp=2 dp=2 wp=2 wdp=4
+        wp groups: [0-1] [2-3] [4-5] [6-7] [8-9] [10-11] [12-13] [14-15]
+        data groups: [0,4] [1,5] [2,6] [3,7]
+                     [8,12] [9,13] [10,14] [11,15]
+        pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15]
+        wdp groups: [0,2,4,6] [1,3,5,7]
+                    [8,10,12,14] [9,11,13,15]
         """
         local_rank = None
         ranks_in_group = None
@@ -898,23 +943,27 @@ def init_dist_group(self, use_cpu: bool = False):
         group_world_size = None
         mode = ParallelMode.WEIGHT_DATA

-        for i in range(self.num_weight_dp_group):
-            ranks = [i + j * self.weight_parallel_size for j in range(self.weight_data_parallel_size)]
-            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
-            if use_cpu:
-                group_cpu = (
-                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
-                    if dist.get_backend() != "gloo"
-                    else group
-                )
-            else:
-                group_cpu = None
+        for i in range(self.pipeline_parallel_size):
+            for j in range(self.num_wdp_group_per_pp):
+                ranks = [
+                    i * self.ranks_num_per_pp + j + k * self.weight_parallel_size
+                    for k in range(self.weight_data_parallel_size)
+                ]
+                group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
+                if use_cpu:
+                    group_cpu = (
+                        dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                        if dist.get_backend() != "gloo"
+                        else group
+                    )
+                else:
+                    group_cpu = None

-            if self.rank in ranks:
-                local_rank = ranks.index(self.rank)
-                group_world_size = len(ranks)
-                process_group = group
-                cpu_group = group_cpu
-                ranks_in_group = ranks
+                if self.rank in ranks:
+                    local_rank = ranks.index(self.rank)
+                    group_world_size = len(ranks)
+                    process_group = group
+                    cpu_group = group_cpu
+                    ranks_in_group = ranks

         return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode

diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index 208af18f..af4c9698 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -79,6 +79,9 @@ def args_sanity_check():
     if "tensor" not in gpc.config.parallel:
         gpc.config.parallel._add_item("tensor", 1)

+    if "weight" not in gpc.config.parallel:
+        gpc.config.parallel._add_item("weight", dict(size=1, overlap=False, memory_pool=False))
+
     if isinstance(gpc.config.parallel.pipeline, int):
         pp = gpc.config.parallel.pipeline
     else:
@@ -307,22 +310,30 @@ def args_sanity_check():
         gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False
     ), "sequence parallel does not support use_flash_attn=False"

+    # set default value for tensor parallel
     if isinstance(gpc.config.parallel["tensor"], int):
-        gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], sp="none", intern_overlap=False)
-    if gpc.config.parallel["tensor"].get("sp", None) is None:
-        gpc.config.parallel["tensor"]["sp"] = "none"
-    if gpc.config.parallel["tensor"].get("intern_overlap", None) is None:
-        gpc.config.parallel["tensor"]["intern_overlap"] = False
-    assert gpc.config.parallel["tensor"].get("sp", None) in [
-        "none",
-        "megatron",
-        "flash-attn",
-        "intern",
-    ], "invalid sp mode, only ['none', 'megatron', 'flash-attn', 'intern'] is supported"
+        gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode="mtp")
+    if 
gpc.config.parallel["tensor"].get("mode", None) is None: + gpc.config.parallel["tensor"]["mode"] = "mtp" + assert gpc.config.parallel["tensor"].get("mode", None) in [ + "mtp", + "msp", + "fsp", + "isp", + ], "invalid tensor parallel mode, only ['mtp', 'msp', 'fsp', 'isp'] is supported" + # adapt to old version's sequence parallel config - if gpc.config.parallel["tensor"].get("sp", None) in ["megatron", "flash-attn", "intern"]: + if gpc.config.parallel["tensor"].get("mode", None) in ["msp", "fsp", "isp"]: gpc.config.parallel.sequence_parallel = True + # set default value for weight parallel + if gpc.config.parallel["weight"].get("overlap", None) is None: + gpc.config.parallel["weight"]["overlap"] = False + if gpc.config.parallel["weight"].get("memory_pool", None) is None: + gpc.config.parallel["weight"]["memory_pool"] = False + if gpc.config.parallel["tensor"]["mode"] != "isp": + assert gpc.config.parallel["weight"]["size"] <= 1, "weight parallel is only supported with isp" + # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy if hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1: assert ( diff --git a/train.py b/train.py index 996d7465..b64d3011 100644 --- a/train.py +++ b/train.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +from pickle import FALSE import socket import time import traceback @@ -341,6 +342,33 @@ def main(args): initialize_distributed_env(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed) assert hasattr(gpc, "config") and gpc.config is not None + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_tp_group:{gpc.get_ranks_in_group(ParallelMode.TENSOR)}", + flush=True, + ) + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_wp_group:{gpc.get_ranks_in_group(ParallelMode.WEIGHT)}", + flush=True, + ) + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_dp_group:{gpc.get_ranks_in_group(ParallelMode.DATA)}", + flush=True, + ) + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_pp_group:{gpc.get_ranks_in_group(ParallelMode.PIPELINE)}", + flush=True, + ) + # print( + # f"ht debug rank:{gpc.get_global_rank()} ranks_in_wdp_group:{gpc.get_ranks_in_group(ParallelMode.WEIGHT_DATA)}", + # flush=True, + # ) + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_zero1_group:{gpc.get_ranks_in_group(ParallelMode.ZERO1)}", + flush=True, + ) + + assert False + # initialize monitor manager context with initialize_monitor_manager( job_name=gpc.config.JOB_NAME, alert_address=gpc.config.monitor.alert.feishu_alert_address From e9cd5210a18c8aa66e24d1baea7121313cc3b25d Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 20 Dec 2023 18:56:20 +0800 Subject: [PATCH 087/153] feat(model): refactor model and optimizer for msp/fsp/isp --- internlm/core/context/__init__.py | 11 +- internlm/core/context/parallel_context.py | 31 +- internlm/core/naive_amp.py | 2 +- internlm/core/scheduler/pipeline_scheduler.py | 3 +- internlm/data/batch_sampler.py | 2 +- internlm/initialize/initialize_trainer.py | 4 +- internlm/model/embedding.py | 8 +- internlm/model/linear.py | 16 +- internlm/model/loss.py | 2 +- internlm/model/modeling_internlm.py | 75 +- internlm/model/modeling_moe.py | 2 +- internlm/model/multi_head_attention.py | 21 +- internlm/solver/optimizer/__init__.py | 1 - .../solver/optimizer/hybrid_zero_optim.py | 91 +- .../solver/optimizer/hybrid_zero_optim2.py | 1018 ----------------- internlm/solver/optimizer/utils.py | 64 +- 
 internlm/train/training_internlm.py | 83 +-
 internlm/train/utils.py | 7 +-
 internlm/utils/evaluation.py | 6 +-
 internlm/utils/parallel.py | 78 +-
 tests/test_training/test_loss.py | 6 +-
 train.py | 29 +-
 22 files changed, 264 insertions(+), 1296 deletions(-)
 delete mode 100644 internlm/solver/optimizer/hybrid_zero_optim2.py

diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py
index 5382837e..13da8f58 100644
--- a/internlm/core/context/__init__.py
+++ b/internlm/core/context/__init__.py
@@ -1,8 +1,7 @@
 from .parallel_context import (
-    IS_SEQUENCE_PARALLEL,
-    IS_TENSOR_PARALLEL,
+    IS_TENSOR_ZERO_PARALLEL,
+    IS_TENSOR_DATA_PARALLEL,
     IS_REPLICA_ZERO_PARALLEL,
-    IS_SEQUENCE_DATA_PARALLEL,
     IS_WEIGHT_ZERO_PARALLEL,
     Config,
     ParallelContext,
@@ -31,8 +30,10 @@
 __all__ = [
     "Config",
-    "IS_TENSOR_PARALLEL",
-    "IS_SEQUENCE_PARALLEL",
+    "IS_TENSOR_ZERO_PARALLEL",
+    "IS_TENSOR_DATA_PARALLEL",
+    "IS_REPLICA_ZERO_PARALLEL",
+    "IS_WEIGHT_ZERO_PARALLEL",
     "global_context",
     "ParallelContext",
     "ParallelMode",

diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py
index 538d3947..53416761 100644
--- a/internlm/core/context/parallel_context.py
+++ b/internlm/core/context/parallel_context.py
@@ -24,10 +24,12 @@
 from .process_group_initializer import ParallelMode
 from .random import add_seed, get_seeds, set_mode

-IS_TENSOR_PARALLEL = "is_tensor_parallel"
-IS_SEQUENCE_PARALLEL = "is_sequence_parallel"
+
 IS_REPLICA_ZERO_PARALLEL = "is_replica_zero_parallel"
-IS_SEQUENCE_DATA_PARALLEL = "is_sequence_data_parallel"
+# for isp, with optimizer split in dp group
+IS_TENSOR_DATA_PARALLEL = "is_tensor_data_parallel"
+# for mtp/msp/fsp, with optimizer split in zero1 group
+IS_TENSOR_ZERO_PARALLEL = "is_tensor_zero_parallel"
 IS_WEIGHT_ZERO_PARALLEL = "is_weight_zero_parallel"

 logger = get_logger(__file__)
@@ -249,30 +251,11 @@ def get_prev_global_rank(self, parallel_mode: ParallelMode):

         return ranks_in_group[(local_rank - 1) % world_size]

-    def is_using_dp(self):
-        """Returns a boolean value indicating whether the current device is initilized with
-        ParallelMode.DATA and its world_size is greater than 1.
+    def is_using_parallel_mode(self, parallel_mode):
+        """Returns a boolean value indicating whether the current device is initialized with
+        the given parallel_mode and its world_size is greater than 1.
         """
-        return self.is_initialized(ParallelMode.DATA) and self.get_world_size(ParallelMode.DATA) > 1
-
-    def is_using_tp(self):
-        """Returns a boolean value indicating whether the current device is initilized with
-        ParallelMode.TENSOR and its world_size is greater than 1.
-        """
-        return self.is_initialized(ParallelMode.TENSOR) and self.get_world_size(ParallelMode.TENSOR) > 1
-
-    def is_using_pp(self):
-        """Returns a boolean value indicating whether the current device is initilized with
-        ParallelMode.PIPELINE and its world_size is greater than 1.
-        """
-        return self.is_initialized(ParallelMode.PIPELINE) and self.get_world_size(ParallelMode.PIPELINE) > 1
-
-    def is_using_sequence(self):
-        """Returns a boolean value indicating whether the current device is initilized with
-        ParallelMode.SEQUENCE and its world_size is greater than 1.
- """ - return False - # return gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1 + return self.is_initialized(parallel_mode) and self.get_world_size(parallel_mode) > 1 def is_first_rank(self, parallel_mode: ParallelMode): """Returns a boolean value indicating whether the current device is the first one diff --git a/internlm/core/naive_amp.py b/internlm/core/naive_amp.py index fb04759b..ffd413b0 100644 --- a/internlm/core/naive_amp.py +++ b/internlm/core/naive_amp.py @@ -51,7 +51,7 @@ def __init__( self._sync_buf = sync_buffer self.dtype = dtype - if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: + if gpc.is_using_parallel_mode(parallel_mode): self._process_group = gpc.get_group(parallel_mode) self._world_size = gpc.get_world_size(parallel_mode) else: diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py index efc9187a..622c91f6 100644 --- a/internlm/core/scheduler/pipeline_scheduler.py +++ b/internlm/core/scheduler/pipeline_scheduler.py @@ -135,8 +135,7 @@ def __init__( self.scatter_gather_tensors = ( scatter_gather_tensors - and gpc.is_initialized(ParallelMode.TENSOR) - and gpc.get_world_size(ParallelMode.TENSOR) > 1 + and gpc.is_using_parallel_mode(ParallelMode.TENSOR) ) if gpc.config.parallel.sequence_parallel: diff --git a/internlm/data/batch_sampler.py b/internlm/data/batch_sampler.py index 16fd6fce..a94a7210 100644 --- a/internlm/data/batch_sampler.py +++ b/internlm/data/batch_sampler.py @@ -141,7 +141,7 @@ def get_dpsampler_dataloader( """ _kwargs = kwargs.copy() - if add_sampler and gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1: + if add_sampler and gpc.is_using_parallel_mode(ParallelMode.DATA): sampler = DataParallelSampler(dataset, shuffle=shuffle, drop_last=drop_last) else: sampler = None diff --git a/internlm/initialize/initialize_trainer.py b/internlm/initialize/initialize_trainer.py index beb4a40f..7893dd54 100644 --- a/internlm/initialize/initialize_trainer.py +++ b/internlm/initialize/initialize_trainer.py @@ -68,7 +68,7 @@ def initialize_trainer( assert isinstance(optimizer, BaseOptimizer), "optimizer must be instance of BaseOptimizer" # gradient handler, only support PipelineSharedModuleGradientHandler now - if gpc.is_using_pp(): + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): gpc.config.gradient_handler = [dict(type="PipelineSharedModuleGradientHandler")] gradient_handler_cfg = gpc.config.get("gradient_handler", []) gradient_handlers = [] @@ -84,7 +84,7 @@ def initialize_trainer( data_fn = None else: data_fn = unpack_data - if gpc.is_using_pp(): + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): gpc.config.NUM_MICRO_BATCHES = gpc.config.data.micro_num tensor_shape = get_tensor_shape() use_interleaved = ( diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py index 225a5f16..11c71b2c 100644 --- a/internlm/model/embedding.py +++ b/internlm/model/embedding.py @@ -44,7 +44,7 @@ def __init__( self.num_embeddings = num_embeddings self.embed_dim = embedding_dim - embed_dim_per_partition = embedding_dim // gpc.sequence_parallel_size + embed_dim_per_partition = embedding_dim // gpc.tensor_parallel_size self.padding_idx = padding_idx self.embed_args = args @@ -55,10 +55,10 @@ def __init__( def forward(self, input_: Tensor) -> Tensor: output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) - output = 
gather_forward_split_backward(output_parallel, ParallelMode.SEQUENCE, dim=-1) + output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1) - if gpc.config.parallel.sequence > 1: - output = split_forward_gather_backward(output, ParallelMode.SEQUENCE, dim=1) + if gpc.config.parallel.sequence_parallel: + output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1) # print( # f"ht debug embed: rank:{gpc.get_global_rank()} output.shape:{output.shape} output:{output}", # flush=True, diff --git a/internlm/model/linear.py b/internlm/model/linear.py index b92b2ee5..fc5175d9 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -403,28 +403,28 @@ def __init__( ) -def get_mlp_cls(sp_mode: str): - if sp_mode in ["none", "flash-attn"]: +def get_mlp_cls(tp_mode: str): + if tp_mode in ["mtp", "fsp"]: mlp_cls = FeedForward - elif sp_mode == "megatron": + elif tp_mode == "msp": mlp_cls = MegatronFeedForward else: mlp_cls = FSTPFeedForward return mlp_cls -def get_linear_cls(sp_mode: str, parallel_mode: str): +def get_linear_cls(tp_mode: str, parallel_mode: str): if parallel_mode == "column": - if sp_mode in ["none", "flash-attn"]: + if tp_mode in ["mtp", "fsp"]: cls = ColumnParallelLinearTorch - elif sp_mode == "megatron": + elif tp_mode == "msp": cls = MegatronColumnParallelLinearTorch else: cls = FSTPLinear elif parallel_mode == "row": - if sp_mode in ["none", "flash-attn"]: + if tp_mode in ["mtp", "fsp"]: cls = RowParallelLinearTorch - elif sp_mode == "megatron": + elif tp_mode == "msp": cls = MegatronRowParallelLinearTorch else: cls = FSTPLinear diff --git a/internlm/model/loss.py b/internlm/model/loss.py index a634d2c7..ac92b4b9 100644 --- a/internlm/model/loss.py +++ b/internlm/model/loss.py @@ -28,7 +28,7 @@ def __init__(self, parallel_output=True, label_smoothing=0): self.loss_fn = FlashCrossEntropyLoss( reduction="mean", inplace_backward=True, - process_group=gpc.get_group(ParallelMode.SEQUENCE), + process_group=gpc.get_group(ParallelMode.TENSOR), label_smoothing=label_smoothing, ) # The loss in this place is bound to the gather_output initialized by VocabParallelClassifier1D else: diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 400ad273..032fef91 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -9,14 +9,7 @@ from flash_attn.modules.mlp import ParallelFusedMLP from torch import nn -from internlm.core.context import ( - IS_SEQUENCE_PARALLEL, - IS_TENSOR_PARALLEL, - IS_REPLICA_ZERO_PARALLEL, - IS_SEQUENCE_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, - ParallelMode, -) +from internlm.core.context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D @@ -85,7 +78,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, - sp_mode: str = "none", + tp_mode: str = "mtp", ): super().__init__() self.checkpoint = checkpoint @@ -95,11 +88,13 @@ def __init__( self.use_flash_attn = use_flash_attn head_dim = hidden_size // num_attention_heads + self.tp_mode = tp_mode + parallel_mode = ParallelMode.WEIGHT if self.tp_mode == "isp" else ParallelMode.TENSOR self.mixer = MHA( embed_dim=hidden_size, num_heads=num_attention_heads, - process_group=gpc.get_group(ParallelMode.WEIGHT), - sequence_process_group=gpc.get_group(ParallelMode.SEQUENCE), + 
process_group=gpc.get_group(parallel_mode), + sequence_process_group=gpc.get_group(ParallelMode.TENSOR), dropout=attn_drop_rate, max_position_embeddings=max_position_embeddings, softmax_scale=1 / math.sqrt(head_dim), @@ -111,7 +106,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, - sp_mode=sp_mode, + tp_mode=self.tp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -123,12 +118,12 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - mlp_cls = get_mlp_cls(sp_mode) + mlp_cls = get_mlp_cls(self.tp_mode) self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, - process_group=gpc.get_group(ParallelMode.WEIGHT), + process_group=gpc.get_group(parallel_mode), bias=False, device=device, dtype=dtype, @@ -139,7 +134,7 @@ def __init__( int(hidden_size * mlp_ratio), out_features=hidden_size, activation="gelu_approx", - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias1=False, bias2=False, sequence_parallel=gpc.config.parallel.sequence_parallel, @@ -148,23 +143,6 @@ def __init__( device=device, dtype=dtype, ) - for _, param in self.mlp.named_parameters(): - # if gpc.get_world_size(ParallelMode.TENSOR) > 1: - # setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) - for param in self.norm1.parameters(): - # if gpc.config.parallel.sequence_parallel is True: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - # if gpc.config.parallel.weight.size > 1: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - setattr(param, IS_REPLICA_ZERO_PARALLEL, True) - for param in self.norm2.parameters(): - # if gpc.config.parallel.sequence_parallel is True: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - # if gpc.config.parallel.weight.size > 1: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - setattr(param, IS_REPLICA_ZERO_PARALLEL, True) self.dropout2 = nn.Dropout(drop_rate) self.use_swiglu = use_swiglu @@ -327,18 +305,14 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) - self.sp_mode = gpc.config.parallel["tensor"]["sp"] - if self.sp_mode == "none": - gpc.config.parallel.sequence_parallel = False - else: - gpc.config.parallel.sequence_parallel = True + self.tp_mode = gpc.config.parallel.tensor.mode if is_reward: head_cls = RewardModelLinear else: head_cls = ( ScaleColumnParallelLinear - if self.sp_mode in ["flash-attn", "none", "intern"] + if self.tp_mode in ["mtp", "fsp", "isp"] else MegatronScaleColumnParallelLinear ) if first: @@ -357,11 +331,8 @@ def __init__( ) for _, param in self.embedding.named_parameters(): normal_(std=0.0052)(param) - # if gpc.get_world_size(ParallelMode.TENSOR) > 1: - # setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.SEQUENCE) > 1: - setattr(param, IS_SEQUENCE_DATA_PARALLEL, True) self.embed_grad_scale = embed_grad_scale + self.blocks = nn.ModuleList( [ PackedFlashBaseLayer1D( @@ -383,7 +354,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, - sp_mode=self.sp_mode, + tp_mode=self.tp_mode, ) for lid in range(num_layers) ] @@ -396,7 +367,7 @@ def __init__( self.head = head_cls( in_features=hidden_size, out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size, - process_group=gpc.get_group(ParallelMode.SEQUENCE), + process_group=gpc.get_group(ParallelMode.TENSOR), bias=False, device=device, dtype=dtype, @@ -404,16 
+375,6 @@ def __init__( ) for _, param in self.head.named_parameters(): normal_(std=0.0052)(param) - # if gpc.get_world_size(ParallelMode.TENSOR) > 1: - # setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.SEQUENCE) > 1: - setattr(param, IS_SEQUENCE_DATA_PARALLEL, True) - for param in self.norm.parameters(): - # if gpc.config.parallel.sequence_parallel is True: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - # if gpc.config.parallel.weight.size > 1: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - setattr(param, IS_REPLICA_ZERO_PARALLEL, True) self.parallel_output = parallel_output @@ -438,11 +399,9 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] - # if the sequence parallel mode is 'intern', the indexes should also be split in sequence dimension. - if gpc.config.parallel.sequence_parallel and self.sp_mode == "intern": + # if the sequence parallel mode is 'isp', the indexes should also be split in sequence dimension. + if gpc.config.parallel.sequence_parallel and self.tp_mode == "isp": indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) - if gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1: - indexes = split_forward_gather_backward(indexes, ParallelMode.SEQUENCE, dim=0) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py index 43489bc4..9d9f3238 100644 --- a/internlm/model/modeling_moe.py +++ b/internlm/model/modeling_moe.py @@ -9,7 +9,7 @@ from flash_attn.modules.mlp import ParallelFusedMLP from torch import nn -from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode +from internlm.core.context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.core.naive_amp import set_fp32_attr_to_module from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 5d9e0a40..fb0309a5 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -10,8 +10,6 @@ import torch.nn.functional as F from einops import rearrange -from internlm.core.context import IS_WEIGHT_ZERO_PARALLEL - try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func except ImportError: @@ -37,7 +35,7 @@ from torch import Tensor, nn from torch.nn import Module -from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode +from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding from internlm.model.linear import get_linear_cls @@ -174,7 +172,7 @@ def __init__( use_flash_attn: bool = True, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, - sp_mode: str = "none", + tp_mode: str = "mtp", ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -202,7 +200,7 @@ def __init__( self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device) # notice here should change bias=True - Wqkv_cls = get_linear_cls(sp_mode, "column") + Wqkv_cls = get_linear_cls(tp_mode, "column") self.Wqkv = 
Wqkv_cls( embed_dim, 3 * embed_dim, @@ -218,14 +216,14 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) - if sp_mode == "intern": + if tp_mode == "isp": self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=sequence_process_group) self.inner_cross_attn = DistributedAttention( self.inner_cross_attn, sequence_process_group=sequence_process_group ) # output projection always have the bias (for now) - out_proj_cls = get_linear_cls(sp_mode, "row") + out_proj_cls = get_linear_cls(tp_mode, "row") self.out_proj = out_proj_cls( embed_dim, embed_dim, @@ -234,15 +232,6 @@ def __init__( sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) - # need to assign tp attribute so that internlm know it is tensor parallel module - # if gpc.get_world_size(ParallelMode.TENSOR) > 1: - # for name in ["out_proj", "Wqkv"]: - # for param in getattr(self, name).parameters(): - # setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - for name in ["out_proj", "Wqkv"]: - for param in getattr(self, name).parameters(): - setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) def forward(self, x, seqlen=None, inference_params=None, **kwargs): if kwargs.get("indexes", None) is not None: diff --git a/internlm/solver/optimizer/__init__.py b/internlm/solver/optimizer/__init__.py index 309f2295..7c6a1c64 100644 --- a/internlm/solver/optimizer/__init__.py +++ b/internlm/solver/optimizer/__init__.py @@ -3,6 +3,5 @@ from .fsdp_optimizer import FSDPadaptOptimizer from .hybrid_zero_optim import HybridZeroOptimizer, reload_zero_fp32_buff -from .hybrid_zero_optim2 import HybridZeroOptimizer2 __all__ = ["FSDPadaptOptimizer", "HybridZeroOptimizer", "reload_zero_fp32_buff"] diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 3092a625..681dfc9c 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -9,8 +9,13 @@ import torch.distributed as dist from torch.optim import Optimizer -from internlm.core.context import IS_SEQUENCE_PARALLEL, Config, ParallelMode +from internlm.core.context import IS_REPLICA_ZERO_PARALLEL, Config, ParallelMode from internlm.core.context import global_context as gpc +from internlm.core.context.parallel_context import ( + IS_TENSOR_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, +) from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, @@ -71,6 +76,7 @@ def __init__( clip_grad_norm = zero_cfg.clip_grad_norm self._overlap_sync_grad = zero_cfg.overlap_sync_grad self._overlap_sync_param = zero_cfg.overlap_sync_param + self.use_isp = gpc.config.parallel.tensor.mode == "isp" super().__init__(optim=optimizer) @@ -82,7 +88,8 @@ def __init__( # ParameterStore will manage the tensor buffers used for zero # it will not manage the tensors used by mixed precision training self._param_store = ParameterStore(ParallelMode.ZERO1) - self._grad_store = GradientStore(ParallelMode.DATA) + parallel_mode = ParallelMode.WEIGHT_DATA if self.use_isp else ParallelMode.DATA + self._grad_store = GradientStore(parallel_mode) self._bucket_store: List[BucketStore] = [] self._accum_grad_buckets: List[BucketStore] = [] self._bucket_in_progress = [] @@ -120,8 +127,10 @@ def __init__( self.rank_unique_id = ( f"gpus-{gpc.get_world_size(ParallelMode.GLOBAL)}_" - + 
f"pp-{gpc.get_local_rank(ParallelMode.PIPELINE)}_" + + f"wp-{gpc.get_local_rank(ParallelMode.WEIGHT)}_" + f"tp-{gpc.get_local_rank(ParallelMode.TENSOR)}_" + + f"dp-{gpc.get_local_rank(ParallelMode.DATA)}_" + + f"pp-{gpc.get_local_rank(ParallelMode.PIPELINE)}_" + f"zo-{gpc.get_local_rank(ParallelMode.ZERO1)}.pt" ) self.params_per_rank_id_dict = [] @@ -129,7 +138,7 @@ def __init__( if self._overlap_sync_param: assert self._param_bcast_sync_handler is not None - if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: + if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None @@ -148,17 +157,25 @@ def __init__( # to find real zero mode. if zero is not used, set all param group as ParallelMode.ZERO1 # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode - zero_mode = ( - ParallelMode.ZERO1 - if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA - else ParallelMode.EXPERT_DATA - ) + # zero_mode = ( + # ParallelMode.ZERO1 + # if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA + # else ParallelMode.EXPERT_DATA + # ) + zero_mode = param_group["optimizer_mode"] + self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) self._zero_world_size.append(gpc.get_world_size(zero_mode)) # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name self._broadcast_parallel_mode.append(zero_mode) - self._bucket_store.append(BucketStore(group_id, param_group["dp_mode"])) - self._accum_grad_buckets.append(BucketStore(group_id, param_group["dp_mode"])) + + if param_group["name"] != "embed_head" and self.use_isp: + grad_reduce_mode = ParallelMode.WEIGHT_DATA + else: + grad_reduce_mode = ParallelMode.DATA + + self._bucket_store.append(BucketStore(group_id, grad_reduce_mode)) + self._accum_grad_buckets.append(BucketStore(group_id, grad_reduce_mode)) # assign parameters to ranks the params in the list are sorted params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group) @@ -220,8 +237,6 @@ def __init__( # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled. self.skip_grad_reduce = False - # reduction hook is only used if overlapping communication - # if it is stage 1 without overlapping, no hook will be attached self._attach_reduction_hook() @property @@ -307,12 +322,12 @@ def _define_and_attach(param, reduce_rank=None): reduce_rank=reduce_rank, ) - def reduction_sp_func(): + def reduction_layernorm_func(): handle = reduce_tensor( param.grad, dtype=None, dst_rank=reduce_rank, - parallel_mode=ParallelMode.TENSOR, + parallel_mode=ParallelMode.WEIGHT if self.use_isp else ParallelMode.TENSOR, ) handle.wait() @@ -328,23 +343,24 @@ def accum_grad_hook(*args): # pylint: disable=W0613 reduce_scatter_checker() # define hook for sequence_parallel - def reduce_grad_hook_sp(*args): # pylint: disable=W0613 + def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 if self.skip_grad_reduce is False: - reduction_sp_func() + reduction_layernorm_func() # get the AccumulateGrad object of the param itself # If these objects are not kept, reduction hooks may not be attached successfully. 
accum_grad_obj = get_grad_accumulate_object(param) self._grad_store.add_accumulate_grad_object(accum_grad_obj) - # if sequence_parallel is True, - # the grad of norm should be all-reduce across the tp process group + # the grad of layernorm should be all-reduce across the global process group + # here is the first stage all-reduce in tp/wp process group + # the second stage all-reduce will be processed in reduce_grad_hook if ( - gpc.config.parallel.sequence_parallel is True - and hasattr(param, IS_SEQUENCE_PARALLEL) - and getattr(param, IS_SEQUENCE_PARALLEL) is True + gpc.config.parallel.weight.size > 1 + and hasattr(param, IS_REPLICA_ZERO_PARALLEL) + and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True ): - accum_grad_obj.register_hook(reduce_grad_hook_sp) + accum_grad_obj.register_hook(extra_layernorm_reduce_grad_hook) # we should not only register for parameters which have _fstp_reduce_scatter_str attr. # we must keep up with reduce_grad_hook. @@ -373,9 +389,9 @@ def belongs_to_current_rank(self, param) -> bool: :return: True if the parameter should be updated by the current rank. Otherwise false. :rtype: bool """ - tensor_rank = self._param_store.get_param_rank(param) + tensor_ranks = self._param_store.get_param_rank(param) group_id = getattr(param, "group_id") - return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) + return gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) in tensor_ranks def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None: for _param in bucket.get_param(reduce_rank): @@ -592,11 +608,25 @@ def _compute_norm_with_stage( ): # compute norm for gradients that have been reduced params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket) + params_is_padding = False if len(params) == 0: + params_is_padding = True dtype = self.param_groups[group_id]["dtype"] grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] + if group_id == 0: + for param in params: + if self.use_isp: + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + else: + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + elif group_id == 1: + for param in params: + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + else: + raise NotImplementedError("group_id > 1 is not yet implemented.") + norm = 0 if self._clip_grad_norm > 0: # this norm is before scaling, it will be very large @@ -608,6 +638,17 @@ def _compute_norm_with_stage( zero_mode=self._broadcast_parallel_mode[group_id], ) + if params_is_padding: + for param in params: + if hasattr(param, IS_REPLICA_ZERO_PARALLEL): + delattr(param, IS_REPLICA_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_DATA_PARALLEL): + delattr(param, IS_TENSOR_DATA_PARALLEL) + if hasattr(param, IS_TENSOR_ZERO_PARALLEL): + delattr(param, IS_TENSOR_ZERO_PARALLEL) + if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): + delattr(param, IS_WEIGHT_ZERO_PARALLEL) + return norm def _compute_param_norm_stage( diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py deleted file mode 100644 index fbfa20cd..00000000 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ /dev/null @@ -1,1018 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import math -from functools import partial -from typing import List, Optional - -import torch -import torch.distributed as dist -from torch.optim import Optimizer - -from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_REPLICA_ZERO_PARALLEL, 
Config, ParallelMode -from internlm.core.context import global_context as gpc -from internlm.core.context.parallel_context import IS_SEQUENCE_DATA_PARALLEL -from internlm.monitor import send_alert_message -from internlm.solver.optimizer.store import ( - BucketStore, - GradientStore, - ParameterStore, - TensorBucket, -) -from internlm.solver.optimizer.utils import ( - DynamicGradScaler, - ParamBcastSyncHandler, - flatten, - get_grad_accumulate_object, - has_inf_or_nan, - reduce_tensor, - release_param_grad, - split_half_float_double, - sync_param, -) -from internlm.utils.common import get_current_device -from internlm.utils.logger import get_logger -from internlm.utils.megatron_timers import megatron_timer as timer -from internlm.utils.timeout import llm_timeout - -from .base_optimizer import BaseOptimizer -from .utils import compute_layer_norm, compute_norm, compute_param_norm - -inf = math.inf -logger = get_logger(__file__) - - -class HybridZeroOptimizer2(BaseOptimizer): - """ - Hybrid Zero Optimizer. - """ - - def __init__( - self, - optimizer: Optimizer, - cpu_offload=False, - grad_scal_cfg: Config = None, - zero_cfg: Config = None, - param_bcast_sync_handler: ParamBcastSyncHandler = None, - ): - # DynamicGradScaler related args - if gpc.config.model.dtype is torch.float32: - initial_scale = 1 - else: - initial_scale = grad_scal_cfg.fp16.initial_scale - min_scale = grad_scal_cfg.fp16.min_scale - growth_interval = grad_scal_cfg.fp16.growth_interval - growth_factor = grad_scal_cfg.growth_factor - backoff_factor = grad_scal_cfg.backoff_factor - hysteresis = grad_scal_cfg.hysteresis - max_scale = grad_scal_cfg.max_scale - - # Zero related args - reduce_bucket_size = zero_cfg.reduce_bucket_size - clip_grad_norm = zero_cfg.clip_grad_norm - self._overlap_sync_grad = zero_cfg.overlap_sync_grad - self._overlap_sync_param = zero_cfg.overlap_sync_param - - super().__init__(optim=optimizer) - - self._cpu_offload = cpu_offload - self._zero_local_rank = [] - self._zero_world_size = [] - self._broadcast_parallel_mode = [] - - # ParameterStore will manage the tensor buffers used for zero - # it will not manage the tensors used by mixed precision training - self._param_store = ParameterStore(ParallelMode.ZERO1) - self._grad_store = GradientStore(ParallelMode.WEIGHT_DATA) - self._bucket_store: List[BucketStore] = [] - self._accum_grad_buckets: List[BucketStore] = [] - self._bucket_in_progress = [] - - # fp16 and fp32 params for mixed precision training - self._fp16_param_groups = dict() - self._fp32_flat_param_groups_of_current_rank = dict() - - # communication params - # self._overlap_communication = overlap_communication - self._reduce_bucket_size = reduce_bucket_size - - self._comm_bcast_stream = torch.cuda.Stream() - - # gradient scaler - self.grad_scaler = DynamicGradScaler( - initial_scale=initial_scale, - min_scale=min_scale, - growth_factor=growth_factor, - backoff_factor=backoff_factor, - growth_interval=growth_interval, - hysteresis=hysteresis, - max_scale=max_scale, - ) - self._found_overflow = torch.cuda.FloatTensor([0], device=get_current_device()) - - # gradient clipping - self._clip_grad_norm = clip_grad_norm - - # need to record the rank in which parameter groups are not assigned parameters. 
- self.param_group_has_params = [] - self.param_group_no_params_ranks = [] - self.padding_grad = torch.zeros([32], dtype=gpc.config.model.dtype, device=get_current_device()) - self.padding_tensor = torch.zeros([32], dtype=gpc.config.model.dtype, device=get_current_device()) - - self.rank_unique_id = ( - f"gpus-{gpc.get_world_size(ParallelMode.GLOBAL)}_" - + f"wp-{gpc.get_local_rank(ParallelMode.WEIGHT)}_" - + f"sp-{gpc.get_local_rank(ParallelMode.SEQUENCE)}_" - + f"dp-{gpc.get_local_rank(ParallelMode.DATA)}_" - + f"wdp-{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}_" - + f"zo-{gpc.get_local_rank(ParallelMode.ZERO1)}.pt" - ) - self.params_per_rank_id_dict = [] - self._param_bcast_sync_handler = param_bcast_sync_handler - if self._overlap_sync_param: - assert self._param_bcast_sync_handler is not None - - if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: - self._fstp_handler = gpc.fstp_handler - else: - self._fstp_handler = None - - # iterate over the param group in the optimizer - # partition these param groups for data parallel training - # and add buffers to parameter store for future access - for group_id, param_group in enumerate(self.optim.param_groups): - group_params = param_group["params"] - - # set the dtype for each param group - param_group["dtype"] = group_params[0].dtype if len(group_params) != 0 else None - - # add the fp16 params to fp16_param_groups for bookkeeping - self._fp16_param_groups[group_id] = group_params - - # to find real zero mode. if zero is not used, set all param group as ParallelMode.ZERO1 - # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode - # zero_mode = ( - # ParallelMode.ZERO1 - # if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA - # else ParallelMode.EXPERT_DATA - # ) - zero_mode = param_group["optimizer_mode"] - - self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) - self._zero_world_size.append(gpc.get_world_size(zero_mode)) - # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name - self._broadcast_parallel_mode.append(zero_mode) - - grad_reduce_mode = ParallelMode.WEIGHT_DATA - if param_group["name"] == "embed_head": - grad_reduce_mode = ParallelMode.DATA - - self._bucket_store.append(BucketStore(group_id, grad_reduce_mode)) - self._accum_grad_buckets.append(BucketStore(group_id, grad_reduce_mode)) - - # assign parameters to ranks the params in the list are sorted - params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group) - self.param_group_no_params_ranks.append(no_params_ranks) - self.param_group_has_params.append(self._zero_local_rank[group_id] not in no_params_ranks) - - # store the mapping between param to rank each param should belong to only one rank. - # we can skip the moe param and do not keep them in _param_store to save memory - # (means we need to deal with moe param in a different way), but it will increase - # complexity and reduce code readablity. - for rank, params in enumerate(params_per_rank): - # check whether any rank is not assigned params. 
- if len(params) != 0: - self._param_store.add_fp16_param_list_by_rank_group(rank, group_id, params) - for param in params: - setattr(param, "group_id", group_id) - self._param_store.set_param_to_rank(param, rank) - - # move to cpu to make room to create the flat tensor - for param in group_params: - param.data = param.data.cpu() - - # flatten the reordered tensors - # if param_group["name"] == "embed_head": - # tensor_list = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) - # with torch.no_grad(): - # flat_tensor = flatten(tensor_list) - # flat_tensor = flat_tensor.data.cuda() - # sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) - # # for rank in range(self._zero_world_size[group_id]): - # self._param_store.add_flat_fp16_param_by_rank_group( - # self._zero_local_rank[group_id], group_id, flat_tensor - # ) - # else: - for rank in range(self._zero_world_size[group_id]): - # No flat fp16 buffer is allocated if the process has no parameters. - if rank not in self.param_group_no_params_ranks[group_id]: - tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) - with torch.no_grad(): - flat_tensor = flatten(tensor_list) - flat_tensor = flat_tensor.data.cuda() - self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) - sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) - - # create a copy of fp32 weights of the parameters for which this rank is responsible - # No flat fp32 buffer is allocated if the process has no parameters. - if self.param_group_has_params[group_id]: - fp16_flat_current_rank = self._param_store.get_flat_fp16_param_by_rank_group( - self._zero_local_rank[group_id], group_id - ) - fp32_flat_current_rank = fp16_flat_current_rank.float() - device = "cpu" if self._cpu_offload else get_current_device() - fp32_flat_current_rank = fp32_flat_current_rank.to(device) - fp32_flat_current_rank.requires_grad = True - self._fp32_flat_param_groups_of_current_rank[group_id] = fp32_flat_current_rank - - # need to replace the params in the `params` field in the optimizer - # so that when the optimizer calls step(), it only updates the tensors - # managed by this data parallel rank - param_group["params"] = [fp32_flat_current_rank] - - # set reduction state - for param in self._fp16_param_groups[group_id]: - self._param_store.set_param_reduction_state(param, False) - - assert len(self._fp16_param_groups) != 0 - - # If a rank is not assigned any arguments, 'has_params' is False. - self.has_params = sum(self.param_group_has_params) != 0 - # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled. 
- self.skip_grad_reduce = False - - self._attach_reduction_hook() - - @property - def zero_local_rank(self): - return self._zero_local_rank - - @property - def zero_world_size(self): - return self._zero_world_size - - @property - def loss_scale(self): - return self.grad_scaler.scale - - @property - def num_param_groups(self): - return len(self._fp16_param_groups) - - def _partition_param_list(self, group_id, param_group): - no_params_ranks = [] - # if param_group["name"] == "embed_head": - # params_per_rank = [param_group["params"] for _ in range(self._zero_world_size[group_id])] - # return params_per_rank, set(no_params_ranks) - - params_per_rank = [[] for _ in range(self._zero_world_size[group_id])] - numel_per_rank = [0 for _ in range(self._zero_world_size[group_id])] - self.params_per_rank_id_dict.append([[] for _ in range(self._zero_world_size[group_id])]) - param_list = param_group["params"] - - sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True) - for i, param in enumerate(sorted_params): - global_id = str(i) - for j in range(len(param.size())): - global_id = "_".join([global_id, str(param.size()[j])]) - if self._overlap_sync_param: - rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param) - else: - rank_to_go = numel_per_rank.index(min(numel_per_rank)) - params_per_rank[rank_to_go].append(param) - self.params_per_rank_id_dict[-1][rank_to_go].append(global_id) - numel_per_rank[rank_to_go] += param.numel() - - # check whether any rank is not assigned to parameters. - for rank, params in enumerate(params_per_rank): - if len(params) == 0: - no_params_ranks.append(rank) - - if gpc.is_rank_for_log(): - logger.info( # pylint: disable=W1203 - f"Number of elements on ranks: {numel_per_rank}, rank:{gpc.get_global_rank()}" - ) - - return params_per_rank, set(no_params_ranks) - - def _is_moe_group(self, param_group): - return "moe" in param_group.keys() and param_group["moe"] - - def _is_norm_group(self, param_group): - return "norm" in param_group.keys() and param_group["norm"] - - def _is_gate_group(self, param_group): - return "gate" in param_group.keys() and param_group["gate"] - - # TODO check expert dp is correct when enable moe and overlap both - def _attach_reduction_hook(self): - # we iterate over the fp16 params - # on each param, we register a hook to its AccumulateGrad object - for group_id in range(self.num_param_groups): - param_group = self._fp16_param_groups[group_id] - for param in param_group: - # we should not reduce the param in moe - if not param.requires_grad: - continue - - reduce_rank = None - - def _define_and_attach(param, reduce_rank=None): - reduction_func = partial( - self._store_and_try_reduce_grads_by_bucket, - param=param, - reduce_rank=reduce_rank, - ) - - reduce_scatter_checker = partial( - self._wait_reduce_scatter_and_accumulate_grads, - param=param, - reduce_rank=reduce_rank, - ) - - def reduction_sp_func(): - handle = reduce_tensor( - param.grad, - dtype=None, - dst_rank=reduce_rank, - parallel_mode=ParallelMode.WEIGHT, - ) - handle.wait() - - # define hook - # NOT IMPORTANT BUT GOOD TO KNOW: - # args here is not grad, but allow_unreacable and accumulate_grad - def reduce_grad_hook(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_func() - - # define hook for real gradient accumulation. 
- def accum_grad_hook(*args): # pylint: disable=W0613 - reduce_scatter_checker() - - # define hook for sequence_parallel - def reduce_grad_hook_sp(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_sp_func() - - # get the AccumulateGrad object of the param itself - # If these objects are not kept, reduction hooks may not be attached successfully. - accum_grad_obj = get_grad_accumulate_object(param) - self._grad_store.add_accumulate_grad_object(accum_grad_obj) - - # if sequence_parallel is True, - # the grad of norm should be all-reduce across the tp process group - # if ( - # gpc.config.parallel.sequence_parallel is True - # and hasattr(param, IS_SEQUENCE_PARALLEL) - # and getattr(param, IS_SEQUENCE_PARALLEL) is True - # ): - # accum_grad_obj.register_hook(reduce_grad_hook_sp) - - if ( - gpc.config.parallel.weight.size > 1 - and hasattr(param, IS_REPLICA_ZERO_PARALLEL) - and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True - ): - accum_grad_obj.register_hook(reduce_grad_hook_sp) - - # we should not only register for parameters which have _fstp_reduce_scatter_str attr. - # we must keep up with reduce_grad_hook. - if self._fstp_handler is not None: - accum_grad_obj.register_hook(accum_grad_hook) - - if self._overlap_sync_grad: - accum_grad_obj.register_hook(reduce_grad_hook) - - _define_and_attach(param, reduce_rank) - - def accumulate_left_grads_after_backward(self): - if self._fstp_handler is None: - return - - for group_id in range(self.num_param_groups): - self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id]) - - def belongs_to_current_rank(self, param) -> bool: - """ - Check whether a parameter is supposed to be updated by the process of the current rank - - :param tensor: A :class:`torch.Tensor` object - :type tensor: torch.Tensor - - :return: True if the parameter should be updated by the current rank. Otherwise false. - :rtype: bool - """ - tensor_ranks = self._param_store.get_param_rank(param) - group_id = getattr(param, "group_id") - return gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) in tensor_ranks - - def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None: - for _param in bucket.get_param(reduce_rank): - if not hasattr(_param, "_fstp_reduce_scatter_str"): - continue - - # wait and accumulate gardient. - _key = getattr(_param, "_fstp_reduce_scatter_str") - _comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[_key] - _comm_handle.wait() - _param.grad.add_(_grad) - - # release cuda memory. - if self._fstp_handler.enable_memory_pool: - self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) - _grad = None - self._fstp_handler.reduce_scatter_handlers[_key] = None - - bucket.reset_by_rank(reduce_rank) - - def _wait_reduce_scatter_and_accumulate_grads(self, param, reduce_rank: Optional[int] = None): - param_size = param.numel() - - group_id = getattr(param, "group_id") - current_bucket = self._accum_grad_buckets[group_id] - - # check if the bucket is full - # if full, will reduce the grads already in the bucket - # after reduction, the bucket will be empty - if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: - self._accum_grads_store_in_bucket(current_bucket, reduce_rank) - - # otherwise, add the parameter into bucket. 
- current_bucket.add_num_elements_in_bucket(param_size, reduce_rank) - current_bucket.add_param(param, reduce_rank) - - def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): - param_size = param.numel() - - # check if the bucket is full - # if full, will reduce the grads already in the bucket - # after reduction, the bucket will be empty - group_id = getattr(param, "group_id") - current_bucket = self._bucket_store[group_id] - - if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: - self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False) - - # the param must not be reduced to ensure correctness - is_param_reduced = self._param_store.is_param_reduced(param) - if is_param_reduced: - msg = ( - f"Parameter of size ({param.size()}) has already been reduced, " - + "duplicate reduction will lead to arithmetic incorrectness" - ) - raise RuntimeError(msg) - - # the param must have grad for reduction - assert param.grad is not None, f"Parameter of size ({param.size()}) has None grad, cannot be reduced" - - current_bucket.add_num_elements_in_bucket(param_size, reduce_rank) - current_bucket.add_grad(param.grad, reduce_rank) - current_bucket.add_param(param, reduce_rank) - - def _reduce_grads_stored_in_bucket(self, current_bucket, reduce_rank=None, last_bucket=False): - # reduce grads - self._reduce_grads_by_rank( - reduce_rank=reduce_rank, - grads=current_bucket.get_grad(reduce_rank=reduce_rank), - bucket_size=current_bucket.num_elements_in_bucket(reduce_rank), - group_id=current_bucket.get_param_group_id(), - dp_parallel_mode=current_bucket.get_dp_parallel_mode(), - ) - - params_in_bucket = current_bucket.get_param(reduce_rank=reduce_rank) - - for param in params_in_bucket: - # the is_param_reduced flag should be False showing that - # this param is not reduced before calling self._reduce_grads_by_rank - is_param_reduced = self._param_store.is_param_reduced(param) - - if is_param_reduced: - msg = ( - f"Parameter of size ({param.size()}) has been reduced, " - + "duplicate reduction will lead to arithmetic incorrectness" - ) - raise RuntimeError(msg) - - # update the flag - self._param_store.set_param_reduction_state(param, True) - - if self.belongs_to_current_rank(param): - self._param_store.add_reduced_param_for_compute_norm(param, last_bucket) - else: - self._param_store.add_previous_reduced_param(param) - - current_bucket.reset_by_rank(reduce_rank) - - def _reduce_grads_by_rank(self, reduce_rank, grads, bucket_size, group_id, dp_parallel_mode): - grad_buckets_by_dtype = split_half_float_double(grads) - next_bucket_list = [] - # add parameters into bucket for reduction - for tensor_list in grad_buckets_by_dtype: - param_bucket = TensorBucket(size=bucket_size) - for tensor in tensor_list: - param_bucket.add_to_bucket(tensor, allow_oversize=True) - if not param_bucket.is_empty(): - self._reduce_and_copy( - bucket=param_bucket, reduce_rank=reduce_rank, group_id=group_id, dp_parallel_mode=dp_parallel_mode - ) - next_bucket_list.append(param_bucket) - - # wait for the completion of previouce bucket list reduction, and do unflatten_and_copy() - # here we can also overlap the communication with some memcpy operation caused by bucket.flatten() - for bucket in self._bucket_in_progress: - bucket.commu_handle.wait() - bucket.unflatten_and_copy() - bucket.empty() - self._bucket_in_progress = [] - self._param_store.clear_grads_of_previous_reduced_params() - - # after the completion of bucket list reduction, add new buckets 
into _bucket_in_progress - self._bucket_in_progress = next_bucket_list.copy() - - def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank, group_id, dp_parallel_mode): - # flatten the tensors and do allreduce - bucket.flatten() - bucket.commu_handle = reduce_tensor( - tensor=bucket.get_flat_tensor(), - dtype=None, - dst_rank=reduce_rank, - parallel_mode=dp_parallel_mode, - ) - - # update the reduced tensor - if reduce_rank is None or reduce_rank == self._zero_local_rank[group_id]: - bucket.set_unflatten_and_copy_flag(flag=True) - - def _has_inf_or_nan(self, tensor): - try: - tensor_mean = float(tensor.mean()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. - if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if tensor_mean == float("inf") or tensor_mean == -float("inf"): - return True - return False - - def _sync_grad(self): - # update param already reduced flag - reduction_states = self._param_store.get_param_reduction_states() - for tensor, _ in reduction_states.items(): - reduction_states[tensor] = False - self._param_store.reset_reduced_data_for_compute_norm() - - # accumulate gradient - avg_gradients = self._grad_store._averaged_gradients - for group_id in range(self.num_param_groups): - # the following operations are performed only on the rank to which parameters are assigned. - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - param_group = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) - - if group_id not in avg_gradients: - avg_gradients[group_id] = [] - - param_idx = 0 - for param in param_group: - if param.grad is not None: - if len(avg_gradients[group_id]) == param_idx: - avg_gradients[group_id].append(param.grad) - else: - avg_gradients[group_id][param_idx].add_(param.grad) - param_idx += 1 - - # the gradients needed are stored in the avg_gradients buffer - # thus, can clear this - self.zero_grad() - - def zero_grad(self, set_to_none=True): - """ - Set parameter gradients to zero. If set_to_none = True, gradient - will be set to None to save memory. - - :param set_to_none: Whether set the gradient to None. Default value is True. - :type set_to_none: bool - """ - for _, param_group in self._fp16_param_groups.items(): - for param in param_group: - if set_to_none: - param.grad = None - elif param.grad is not None: - param.grad.detach() - param.grad.zero_() - else: - pass - - def backward(self, loss, retain_graph=False): - loss = self.loss_scale * loss - loss.backward(retain_graph=retain_graph) - - # Gradients may not be fully synchronized here. 
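For reference, the bucket pipeline implemented by _reduce_grads_by_rank and _reduce_and_copy above boils down to three steps: flatten same-dtype grads into one contiguous buffer, launch a single asynchronous collective, and unflatten-copy once the handle completes. A minimal sketch of that pattern using the (semi-private) torch._utils flatten helpers, with illustrative names rather than the optimizer's actual classes:

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def reduce_bucket(grads, group=None):
    # one big contiguous buffer so the collective is a single call
    flat = _flatten_dense_tensors(grads)
    handle = dist.all_reduce(flat, op=dist.ReduceOp.SUM, group=group, async_op=True)
    return flat, handle

def finish_bucket(grads, flat, handle) -> None:
    handle.wait()  # communication done; safe to read `flat`
    for g, synced in zip(grads, _unflatten_dense_tensors(flat, grads)):
        g.copy_(synced)  # write the reduced values back into each grad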
-
-    def _compute_norm_with_stage(
-        self,
-        group_id: int = 0,
-        last_bucket: bool = False,
-        last_stage: bool = False,
-        previous_norm=None,
-    ):
-        # compute norm for gradients that have been reduced
-        params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
-        if len(params) == 0:
-            dtype = self.param_groups[group_id]["dtype"]
-            grads = [self.padding_grad.to(dtype)]
-            params = [self.padding_tensor.to(dtype)]
-
-        norm = 0
-        if self._clip_grad_norm > 0:
-            # this norm is before scaling, it will be very large
-            norm = compute_norm(
-                gradients=grads,
-                parameters=params,
-                last_stage=last_stage,
-                previous_norm=previous_norm,
-                zero_mode=self._broadcast_parallel_mode[group_id],
-            )
-
-        return norm
-
-    def _compute_param_norm_stage(
-        self, group_id: int = 0, last_bucket: bool = False, last_stage: bool = False, previous_param_norms=None
-    ):
-        # compute norm for gradients that have been reduced
-        params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
-
-        total_param_norms = {}
-        if len(params) == 0:
-            dtype = self.param_groups[group_id]["dtype"]
-            grads = [self.padding_grad.to(dtype)]
-            params = [self.padding_tensor.to(dtype)]
-
-        if self._clip_grad_norm > 0:
-            total_param_norms = compute_param_norm(
-                grads,
-                params,
-                last_stage=last_stage,
-                previous_param_norms=previous_param_norms,
-                zero_mode=self._broadcast_parallel_mode[group_id],
-                is_moe_group=self._is_moe_group(self.optim.param_groups[group_id]),
-            )
-        return total_param_norms
-
-    @llm_timeout(func_name="optim_step")
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Args:
-            closure (Callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        Returns:
-            Union[bool, float]: Whether the gradients were successfully updated, and the gradient norms.
- """ - assert closure is None, "closure is not supported by step()" - - # import pdb - - # if gpc.get_global_rank() == 0: - # pdb.set_trace() - - # if not overlapping communication (no reduction hook is attached) - # we need to manually reduce these gradients - if not self._overlap_sync_grad: - for group_id in range(len(self._fp16_param_groups)): - for param in self._fp16_param_groups[group_id]: - # we should not reduce the param in moe - if param.grad is not None: - self._store_and_try_reduce_grads_by_bucket(param) - - # we need to reduce the gradients left in the communication bucket - for group_id in range(self.num_param_groups): - self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) - - # compute norm for gradients in the before bucket - groups_norms = [] - groups_param_norms = [] - for group_id in range(self.num_param_groups): - groups_norms.append(self._compute_norm_with_stage(group_id=group_id)) - if gpc.config.get("grad_norm_profiling", False): - groups_param_norms.append(self._compute_param_norm_stage(group_id=group_id)) - - # clear reduced grads - # grads in the last bucket is reduced - for bucket in self._bucket_in_progress: - bucket.commu_handle.wait() - bucket.unflatten_and_copy() - bucket.empty() - self._bucket_in_progress = [] - self._param_store.clear_grads_of_previous_reduced_params() - # compute norm for gradients in the last bucket - total_norms = {} - total_param_norms = {} - total_layer_norms = {} - for group_id in range(self.num_param_groups): - group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default" - group_name = f"{group_id}_{group_name}" - total_norms[group_name] = self._compute_norm_with_stage( - group_id=group_id, - last_bucket=True, - last_stage=True, - previous_norm=groups_norms[group_id], - ) - if gpc.config.get("grad_norm_profiling", False): - param_norms = self._compute_param_norm_stage( - group_id=group_id, - last_bucket=True, - last_stage=True, - previous_param_norms=groups_param_norms[group_id], - ) - total_layer_norms[group_name], total_param_norms[group_name] = compute_layer_norm( - param_norms=param_norms, loss_scale=self.loss_scale.item() - ) - - # Need to allreduce(avg) the norms across different ranks because moe params will not be synced - # during allreduce - if self._is_moe_group(self.optim.param_groups[group_id]): - # model and zero have been reduced!!! 
-                pg = gpc.get_group(ParallelMode.EXPERT)
-                scaled_norm = total_norms[group_name] * 1.0 / float(gpc.get_world_size(ParallelMode.EXPERT))
-                scaled_norm_tensor = torch.tensor(scaled_norm, device=get_current_device(), dtype=torch.float)
-                dist.all_reduce(scaled_norm_tensor, group=pg)
-                total_norms[group_name] = scaled_norm_tensor.item()
-        timer("sync_grad").start()
-        self._sync_grad()
-        timer("sync_grad").stop()
-
-        state, global_norms = self._step(closure=closure, norms=total_norms)
-        if gpc.config.get("grad_norm_profiling", False):
-            global_norms["layer_norms"] = total_layer_norms
-            global_norms["param_norms"] = total_param_norms
-
-        return state, global_norms
-
-    def _step(self, closure=None, norms=None):
-        assert closure is None, "closure is not supported by step()"
-
-        # check for overflow
-        found_inf = False
-        found_nan = False
-        # if there are INF values in grads, the compute_norm func would also return -1
-        # thus, we try to avoid calling _check_overflow here
-        # found_inf = self._check_overflow()
-        # Because you may encounter inf when computing norm
-
-        if -1 in norms.values():
-            found_inf = True
-
-        if -2 in norms.values():
-            found_nan = True
-
-        loss_scale = float(self.loss_scale.item())  # backup
-        if gpc.config.model.dtype is not torch.float32:
-            self.grad_scaler.update(found_inf)
-
-        # update loss scale if overflow occurs
-        if found_inf:
-            if gpc.is_rank_for_log():
-                logger.warning("Overflow occurs, please check it.")
-                send_alert_message(
-                    address=gpc.config.monitor.alert.feishu_alert_address,
-                    message="Overflow occurs, please check it.",
-                )
-            self._grad_store._averaged_gradients = dict()
-            self.zero_grad()
-            return False, norms
-
-        if found_nan:
-            if gpc.is_rank_for_log():
-                logger.warning("Nan grad norm occurs, please check it.")
-                send_alert_message(
-                    address=gpc.config.monitor.alert.feishu_alert_address,
-                    message="Nan grad norm occurs, please check it.",
-                )
-            self._grad_store._averaged_gradients = dict()
-            self.zero_grad()
-            return False, norms
-        # copy the grad of fp16 param to fp32 param
-        single_grad_partition_groups = []
-        for group_id in range(self.num_param_groups):
-            # compute norm
-            # The following operations are performed only on the rank to which parameters are assigned.
-            if not self.param_group_has_params[group_id]:
-                continue
-
-            # create flat gradient for the flat fp32 params
-            gradients = self._grad_store.get_averaged_gradients_by_group(group_id)
-            with torch.no_grad():
-                flat_fp16_avg_grads = flatten(gradients)
-            self._grad_store.reset_average_gradients_by_group(group_id)
-            gradients = None  # release cuda memory
-
-            dtype = self._fp32_flat_param_groups_of_current_rank[group_id].dtype
-            flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype)
-            flat_fp16_avg_grads = None  # release cuda memory
-
-            param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape
-            assert (
-                param_shape == flat_fp32_avg_grads.shape
-            ), f"fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}"
-
-            # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
-            # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
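The branch that follows applies exactly that synchronization, and its core is a single averaging collective over the tensor-parallel group. A hedged sketch of the idea (assumes dist.ReduceOp.AVG is available, i.e. a NCCL backend on a reasonably recent torch):

import torch.distributed as dist

def sync_replicated_grads(flat_grads, tp_group) -> None:
    # Norm/gate weights are replicated across TP ranks, so their gradients
    # should match; averaging removes accumulated floating-point drift.
    if dist.get_world_size(tp_group) > 1:
        dist.all_reduce(flat_grads, op=dist.ReduceOp.AVG, group=tp_group)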
-            is_tp_sync_groups = (
-                self._is_norm_group(self.optim.param_groups[group_id]),
-                self._is_gate_group(self.optim.param_groups[group_id]),
-            )
-            if any(is_tp_sync_groups):
-                dist.all_reduce(
-                    flat_fp32_avg_grads,
-                    op=dist.ReduceOp.AVG,
-                    group=gpc.get_group(ParallelMode.TENSOR),
-                )
-
-            single_grad_partition_groups.append(flat_fp32_avg_grads)
-            device = self._fp32_flat_param_groups_of_current_rank[group_id].device
-            self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device)
-        # unscale and clip grads
-        # get the global norm
-        global_norm_groups = {}
-        if self._clip_grad_norm > 0:
-            for group_name, norm in norms.items():
-                global_norm_groups[group_name] = norm**0.5
-
-        # the following operations are performed only on the rank to which parameters are assigned.
-        if gpc.config.model.dtype is not torch.float32:
-            if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
-                self._unscale_and_clip_grads(
-                    single_grad_partition_groups,
-                    list(global_norm_groups.values()),
-                    loss_scale,
-                )
-
-        # update the parameters
-        timer("step").start()
-
-        # For those ranks that are not assigned parameters, we just wait for other ranks
-        # to send their updated parameters.
-        if self.has_params:
-            self.optim.step()
-            # release the fp32 grad
-            release_param_grad(self._fp32_flat_param_groups_of_current_rank.values())
-            # update the fp16 partition updated by the current rank
-            for group_id in range(len(self._fp16_param_groups)):
-                if self.param_group_has_params[group_id]:
-                    fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(
-                        rank=self._zero_local_rank[group_id], group_id=group_id
-                    )
-                    fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
-                    fp16_param.data.copy_(fp32_param)
-        torch.cuda.synchronize()
-        with torch.cuda.stream(self._comm_bcast_stream):
-            self.broadcast_params()
-
-        timer("step").stop()
-
-        # updating gradients is not needed here, because the sync_params function was used during
-        # initialization, so synchronization is already maintained
-        for group_name, global_norm in global_norm_groups.items():
-            global_norm_groups[group_name] = global_norm / loss_scale
-        return True, global_norm_groups
-
-    def broadcast_params(self):
-        handles = []
-
-        for group_id in range(self.num_param_groups):
-            # if self.param_groups[group_id]["name"] == "embed_head":
-            #     continue
-            for rank in range(self._zero_world_size[group_id]):
-                # The following operations are performed only on the rank to which parameters are assigned.
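The loop that follows is the ZeRO post-step synchronization: each rank owns one flat fp16 partition and broadcasts it to the rest of its group, with all broadcasts launched asynchronously before a single wait pass. Reduced to its essentials (a sketch with illustrative names; dist.get_process_group_ranks needs torch >= 2.0):

import torch.distributed as dist

def broadcast_owned_partitions(flat_partitions, zero_group) -> None:
    # flat_partitions[i] is the contiguous fp16 shard updated by group-local rank i
    global_ranks = dist.get_process_group_ranks(zero_group)
    handles = [
        dist.broadcast(shard, src=global_ranks[i], group=zero_group, async_op=True)
        for i, shard in enumerate(flat_partitions)
    ]
    for handle in handles:  # overlap all broadcasts, then wait once
        handle.wait()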
- if rank in self.param_group_no_params_ranks[group_id]: - continue - fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id) - # grank = gpc.get_ranks_in_group(group_type)[rank] # need to convert to the global rank - # assert grank == rank, f"{grank} == {rank}" - g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode[group_id])[rank] - handle = dist.broadcast( - fp16_param, - src=g_rank, - group=gpc.get_group(self._broadcast_parallel_mode[group_id]), - async_op=True, - ) - - if self._overlap_sync_param: - self._param_bcast_sync_handler.add_bcast_handle(rank, handle) - else: - handles.append(handle) - - for handle in handles: - handle.wait() - - torch.cuda.synchronize() - - ################## - # FP16 Utilities # - ################## - - def _check_overflow(self): - # clear previous overflow record - self._found_overflow.fill_(0.0) - - # check for overflow - for group_id in range(len(self._fp16_param_groups)): - # The following operations are performed only on the rank to which parameters are assigned. - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - for avg_grad in self._grad_store.get_averaged_gradients_by_group(group_id): - if avg_grad is not None and has_inf_or_nan(avg_grad): - self._found_overflow.fill_(1.0) - break - dist.all_reduce( - self._found_overflow, - op=dist.ReduceOp.MAX, - group=gpc.get_group(ParallelMode.GLOBAL), - ) - - return self._found_overflow.item() > 0 - - def _unscale_and_clip_grads(self, grad_groups_flat, total_norm_groups, loss_scale): - # compute combined scale factor for this group - combined_scale_groups = [] - - if self._clip_grad_norm > 0.0: - # norm is in fact norm*scale - for group_id, total_norm in enumerate(total_norm_groups): - combined_scale_groups.append(loss_scale) - clip = ((total_norm / loss_scale) + 1e-6) / self._clip_grad_norm - if clip > 1.0: - combined_scale_groups[group_id] = clip * loss_scale - - for group_id, grad in enumerate(grad_groups_flat): - grad.data.mul_(1.0 / combined_scale_groups[group_id]) - - def clip_grad_norm(self, model, max_norm): - # will conduct in the step() - pass - - def state_dict(self): - states = {} - grad_scaler = self.grad_scaler.state_dict() - states["grad_scaler"] = grad_scaler - optim_states = self.optim.state_dict() - states["base_optim_states"] = optim_states - - flat_fp32_weights = {} - for group_id, param in self._fp32_flat_param_groups_of_current_rank.items(): - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - assert param.grad is None - flat_fp32_weights[group_id] = param - states["flat_fp32_weights"] = flat_fp32_weights - states["zero_devide_optim_plan"] = self.params_per_rank_id_dict - - return states - - def load_state_dict(self, states): - # TODO: Need to take into account the change in the number of DP. - assert "grad_scaler" in states, "Not found grad_scaler state!" - grad_scaler = states["grad_scaler"] - self.grad_scaler.load_state_dict(grad_scaler) - optim_states = states["base_optim_states"] - self.optim.load_state_dict(optim_states) - - # load fp32 model weight. 
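The restore below mirrors the save path: copy the checkpointed fp32 flats into the live master weights, then refresh the fp16 training copies from them. Schematically (a sketch with illustrative names, not the optimizer's actual interface):

def restore_master_weights(saved_fp32: dict, live_fp32: dict, live_fp16: dict) -> None:
    for group_id, saved in saved_fp32.items():
        assert live_fp32[group_id].shape == saved.shape, "flat param shape changed between runs"
        live_fp32[group_id].data.copy_(saved.data)           # restore fp32 master weights
        live_fp16[group_id].data.copy_(live_fp32[group_id])  # re-derive the fp16 copy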
- flat_fp32_weights = states["flat_fp32_weights"] - assert set(flat_fp32_weights.keys()) == set(self._fp32_flat_param_groups_of_current_rank) - for group_id, param in flat_fp32_weights.items(): - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - self_param = self._fp32_flat_param_groups_of_current_rank[group_id] - assert ( - self_param.shape == param.shape - ), f"The loaded parameter shape is inconsistent, {self_param.shape} != {param.shape}" - self_param.data.copy_(param.data) - - # Load the fp16 model weights. - for group_id in range(len(self._fp16_param_groups)): - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - fp16_param = self._param_store.get_flat_fp16_param_by_rank_group( - rank=self._zero_local_rank[group_id], group_id=group_id - ) - fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id] - fp16_param.data.copy_(fp32_param) - - if "zero_devide_optim_plan" in states: - self.params_per_rank_id_dict = states["zero_devide_optim_plan"] - - -def reload_zero_fp32_buff(optimizer): - # If we use AMP optimizer, we need to update its fp32 buffer as newly loaded weights value. - # Or we must ensure that loading model weights must be done before zero is initialized. - if isinstance(optimizer, HybridZeroOptimizer): - for group_id, param_group in enumerate(optimizer.optim.param_groups): - if optimizer.param_group_has_params[group_id]: - # flatten fp16 params have already been updated by 'load_model_checkpoint' - fp16_flat_current_rank = optimizer._param_store.get_flat_fp16_param_by_rank_group( - optimizer._zero_local_rank[group_id], group_id - ) - # param_group["params"] is fp32 flatten optimizer states of this zero rank. - param_group["params"][0].data.copy_(fp16_flat_current_rank.float()) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 7e760b85..42a9949f 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -14,18 +14,13 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.context.parallel_context import ( - IS_REPLICA_ZERO_PARALLEL, - IS_SEQUENCE_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, -) from internlm.core.naive_amp import NaiveAMPModel from internlm.utils.common import get_current_device, get_tensor_norm, move_norm_to_cuda from internlm.utils.logger import get_logger from internlm.utils.parallel import ( - is_model_parallel_parameter, is_replica_zero_parallel_parameter, - is_sequence_data_parallel_parameter, + is_tensor_data_parallel_parameter, + is_tensor_zero_parallel_parameter, is_weight_zero_parallel_parameter, ) @@ -219,7 +214,7 @@ def calc_lp(grads, norm_type): return norm -def reduce_grads(gradients, parameters, fine_grained=False): +def reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=False): parallel_grads = [] if fine_grained: parallel_grads = {} @@ -248,24 +243,19 @@ def append_grad(g, p): ): continue elif ( - gpc.is_initialized(ParallelMode.TENSOR) - and not is_model_parallel_parameter(p) - and gpc.get_local_rank(ParallelMode.TENSOR) == 0 - ): # if not used in each chunk, such as layernorm - append_grad(g, p) - elif ( - is_replica_zero_parallel_parameter(p) and gpc.get_local_rank(ParallelMode.WEIGHT) == 0 + is_replica_zero_parallel_parameter(p) and gpc.get_local_rank(weight_parallel_mode) == 0 ): # if not used in each chunk, such as layernorm IS_REPLICA_ZERO_PARALLEL parameter group append_grad(g, p) - elif 
is_sequence_data_parallel_parameter(p): - # process all ranks for IS_SEQUENCE_DATA_PARALLEL parameter group + elif is_tensor_data_parallel_parameter(p): + # process all ranks for IS_TENSOR_DATA_PARALLEL parameter group + append_grad(g, p) + elif is_tensor_zero_parallel_parameter(p): + # process all ranks for IS_TENSOR_ZERO_PARALLEL parameter group append_grad(g, p) elif is_weight_zero_parallel_parameter(p): # process all ranks for IS_WEIGHT_ZERO_PARALLEL parameter group append_grad(g, p) - elif is_model_parallel_parameter(p): - append_grad(g, p) - elif gpc.get_local_rank(ParallelMode.TENSOR) != 0: + elif gpc.get_local_rank(weight_parallel_mode) != 0: continue else: raise RuntimeError("Should not arrive here") @@ -286,6 +276,7 @@ def compute_norm( Total norm of the parameters, need total_norm**(1/norm) before using. """ + weight_parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor.mode == "isp" else ParallelMode.TENSOR enable_cuda_kernels = gradients[0].device.type == "cuda" # Norm parameters. norm_type = float(norm_type) @@ -310,7 +301,7 @@ def compute_norm( ) total_norm = total_norm_cuda[0].item() else: - tensor_parallel_grads = reduce_grads(gradients, parameters) + tensor_parallel_grads = reduce_grads(gradients, parameters, weight_parallel_mode) if norm_type == 2.0 and enable_cuda_kernels: tensor_parallel_norm = calc_l2_norm(tensor_parallel_grads) ** norm_type @@ -331,17 +322,29 @@ def compute_norm( if previous_norm is not None: total_norm = total_norm + previous_norm - # Sum across all model-parallel GPUs. - if hasattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL) and getattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL): - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.SEQUENCE)) + """ + Sum across all model-parallel GPUs. + 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and + gradients along the pp+zero dimensions from all ranks should be aggregated. + 2. For the IS_TENSOR_DATA_PARALLEL parameter group, gradients along the tp+pp+zero(dp) dimensions from all ranks should be aggregated. + 3. For the IS_TENSOR_ZERO_PARALLEL parameter group, gradients along the tp+pp+zero dimensions from all ranks should be aggregated. + 4. For the IS_WEIGHT_ZERO_PARALLEL parameter group, gradients along the wp+pp+zero dimensions from all ranks should be aggregated. + """ + if is_tensor_data_parallel_parameter(parameters[0]): + if gpc.is_using_parallel_mode(ParallelMode.TENSOR): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR)) + elif is_tensor_zero_parallel_parameter(parameters[0]): + if gpc.is_using_parallel_mode(ParallelMode.TENSOR): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR)) else: - if gpc.is_initialized(ParallelMode.WEIGHT): + if gpc.is_using_parallel_mode(weight_parallel_mode): dist.all_reduce( total_norm, op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.WEIGHT), + group=gpc.get_group(weight_parallel_mode), ) - if gpc.is_initialized(ParallelMode.PIPELINE): + + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): dist.all_reduce( total_norm, op=dist.ReduceOp.SUM, @@ -349,11 +352,8 @@ def compute_norm( ) # This is because we use zero1, so we need to use this reduction. - # TODO: Check zero group to be a subset of dp group. 
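Taken together, the staged all-reduces above compute one global squared norm: a local sum of squares, followed by SUM reductions over each parallel dimension that partitions the gradients, with the square root applied by the caller. A compact sketch of that composition (the groups are placeholders for the tp/wp, pipeline, and zero groups):

import torch
import torch.distributed as dist

def aggregate_grad_sq_norm(shard_grads, groups) -> torch.Tensor:
    # local contribution of this rank's gradient shards (squared 2-norm)
    sq = sum(g.float().pow(2).sum() for g in shard_grads)
    # fold in every parallel dimension that splits the gradients
    for group in groups:
        if group is not None and dist.get_world_size(group) > 1:
            dist.all_reduce(sq, op=dist.ReduceOp.SUM, group=group)
    return sq  # caller takes sq ** 0.5 for the 2-norm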
- # if (hasattr(parameters[0], IS_REPLICA_ZERO_PARALLEL) and getattr(parameters[0], IS_REPLICA_ZERO_PARALLEL)) or ( - # hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) - # ): - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) + if gpc.is_using_parallel_mode(zero_mode): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) if torch.is_tensor(total_norm): total_norm = total_norm.item() diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 8d786489..587c0035 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -20,7 +20,6 @@ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy from torch.utils.data import ConcatDataset, DataLoader -from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.context.random import set_mode from internlm.core.naive_amp import NaiveAMPModel @@ -40,6 +39,9 @@ FeedForward, RewardModelLinear, ScaleColumnParallelLinear, + BaseScaleColumnParallelLinear, + ColumnParallelLinear, + RowParallelLinear, ) from internlm.model.multi_head_attention import MHA from internlm.model.overlap_handler import FSTPOverlapHandler @@ -48,7 +50,7 @@ from internlm.monitor.monitor import monitor_manager as mm from internlm.solver.beta2_scheduler import Beta2Scheduler from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR -from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer, HybridZeroOptimizer2 +from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups from internlm.utils.common import DummyProfile @@ -57,16 +59,72 @@ from internlm.utils.parallel import ( set_model_params_layer_name, sync_model_param, - sync_model_param_within_tp, sync_model_replica_param_group, ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout +from internlm.core.context import ( + IS_TENSOR_ZERO_PARALLEL, + IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) +from internlm.utils.parallel import ( + is_replica_zero_parallel_parameter, + is_tensor_data_parallel_parameter, + is_tensor_zero_parallel_parameter, + is_weight_zero_parallel_parameter, +) RMSNorm = try_import_RMSNorm() logger = get_logger(__file__) +def set_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): + def _check_module(module): + # layer_norm + if isinstance(module, (RMSNorm, nn.LayerNorm)): + for param in module.parameters(): + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) + + # embedding and head + if isinstance(module, (Embedding1D, ParallelGPT2Embeddings)) or isinstance( + module, BaseScaleColumnParallelLinear + ): + for param in module.parameters(): + if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode == "isp": + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode != "isp": + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + + # for linear module + if isinstance(module, (ColumnParallelLinear, RowParallelLinear)): + for param in module.parameters(): + if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode != "isp": + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + elif 
gpc.is_initialized(ParallelMode.WEIGHT) and gpc.config.parallel.tensor.mode == "isp": + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + + if not isinstance(model, nn.ModuleList): + model = [model] + + for _chunk in model: + if isinstance(_chunk, NaiveAMPModel): + _chunk = _chunk.model + + for name, module in _chunk.named_modules(): + _check_module(module) + + for name, param in _chunk.named_parameters(): + assert ( + is_replica_zero_parallel_parameter(param) + or is_tensor_data_parallel_parameter(param) + or is_tensor_zero_parallel_parameter(param) + or is_weight_zero_parallel_parameter(param) + ), f"parameter with name:{name} has no parallel attribution." + + @llm_timeout(func_name="initialize_model") def initialize_model(): """ @@ -98,6 +156,8 @@ def initialize_model(): sync_buffer=False, ) + set_attr_for_param_groups(model) + # This sync is very important, cause the model weights kept in optimizer are copied # from the origin parameters in the memory, so we should make sure the dp sync # does not influence the model weights in optimizer be different with the origin parameters. @@ -105,19 +165,18 @@ def initialize_model(): # This function is needed to make sure parameters that are not splitted by tensor parallelism are # the same across tensor parallelism. - sync_model_param_within_tp(model) - sync_model_replica_param_group(model) # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. - set_mode(ParallelMode.WEIGHT_DATA) + random_mode = ParallelMode.WEIGHT_DATA if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.DATA + set_mode(random_mode) # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) gpc.fstp_handler = None - if gpc.config.parallel["weight"]["size"] >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.WEIGHT)) return model @@ -185,15 +244,7 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]): eps=adam_cfg.adam_eps, ) - if gpc.config.parallel.weight.size > 1: - optimizer = HybridZeroOptimizer2( - naive_optimizer, - grad_scal_cfg=gpc.config.grad_scaler, - zero_cfg=gpc.config.hybrid_zero_optimizer, - param_bcast_sync_handler=param_bcast_sync_handler, - ) - logger.info("use HybridZeroOptimizer2 for new partition strategy...") - elif not gpc.config.parallel.zero1.fsdp: + if not gpc.config.parallel.zero1.fsdp: optimizer = HybridZeroOptimizer( naive_optimizer, grad_scal_cfg=gpc.config.grad_scaler, diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 382c46d5..54e75ccb 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -2,9 +2,10 @@ import torch -from internlm.core.context.parallel_context import IS_REPLICA_ZERO_PARALLEL, IS_SEQUENCE_DATA_PARALLEL, ParallelMode +from internlm.core.context.parallel_context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param +from internlm.utils.parallel import is_tensor_data_parallel_parameter def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) -> Tuple[Dict]: @@ -111,7 +112,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # print(f"ht debug params_groups before split total len:{len(param_groups[0]['params'])}", flush=True) - # create new 
groups for IS_SEQUENCE_DATA_PARALLEL parameter group + # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} @@ -126,7 +127,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # assign param origin_params = [] for param in pgroup["params"]: - if hasattr(param, IS_SEQUENCE_DATA_PARALLEL) and getattr(param, IS_SEQUENCE_DATA_PARALLEL) is True: + if is_tensor_data_parallel_parameter(param): new_groups["embed_head"]["params"].append(param) # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: # new_groups["layer_norm"]["params"].append(param) diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index c6e27a68..a7c790b3 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -11,7 +11,7 @@ @contextmanager def switch_evaluation_no_pipeline_scheduler(trainer, grad_accum_size, grad_accum_batch_size, metric_hook_list): - if not gpc.is_using_pp(): + if not gpc.is_using_parallel_mode(ParallelMode.PIPELINE): prev_data_process_func = trainer.schedule.data_process_func prev_grad_accum_size = trainer.schedule._grad_accum_size prev_grad_accum_batch_size = trainer.schedule._grad_accum_batch_size @@ -31,7 +31,7 @@ def switch_evaluation_no_pipeline_scheduler(trainer, grad_accum_size, grad_accum @contextmanager def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape, metric_hook_list): - if gpc.is_using_pp(): + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): pre_data_process_func = trainer.schedule.data_process_func prev_num_microbatches = trainer.schedule.num_microbatches prev_tensor_shape = trainer.schedule.tensor_shape @@ -101,7 +101,7 @@ def evaluate_on_val_dls( ): moe_loss = None with torch.inference_mode(): - if gpc.is_using_pp(): + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 966332a1..e354f3b2 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -5,9 +5,9 @@ from torch import nn from internlm.core.context import ( - IS_TENSOR_PARALLEL, IS_REPLICA_ZERO_PARALLEL, - IS_SEQUENCE_DATA_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, IS_WEIGHT_ZERO_PARALLEL, ParallelMode, ) @@ -15,25 +15,32 @@ from internlm.core.naive_amp import NaiveAMPModel -def is_model_parallel_parameter(p): - return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) - - def is_replica_zero_parallel_parameter(p): return hasattr(p, IS_REPLICA_ZERO_PARALLEL) and getattr(p, IS_REPLICA_ZERO_PARALLEL) -def is_sequence_data_parallel_parameter(p): +def is_tensor_data_parallel_parameter(p): + return ( + gpc.is_initialized(ParallelMode.TENSOR) + and gpc.config.parallel.tensor.mode == "isp" + and hasattr(p, IS_TENSOR_DATA_PARALLEL) + and getattr(p, IS_TENSOR_DATA_PARALLEL) + ) + + +def is_tensor_zero_parallel_parameter(p): return ( - gpc.is_initialized(ParallelMode.SEQUENCE) - and hasattr(p, IS_SEQUENCE_DATA_PARALLEL) - and getattr(p, IS_SEQUENCE_DATA_PARALLEL) + gpc.is_initialized(ParallelMode.TENSOR) + and gpc.config.parallel.tensor.mode != "isp" + and hasattr(p, IS_TENSOR_ZERO_PARALLEL) + and 
getattr(p, IS_TENSOR_ZERO_PARALLEL) ) def is_weight_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.WEIGHT) + and gpc.config.parallel.tensor.mode == "isp" and hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) ) @@ -45,43 +52,22 @@ def sync_model_param(model): Args: model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. """ - if gpc.is_initialized(ParallelMode.WEIGHT_DATA) and gpc.get_world_size(ParallelMode.WEIGHT_DATA) > 1: - sync_moe_param = ( - gpc.is_initialized(ParallelMode.EXPERT_DATA) and gpc.get_world_size(ParallelMode.EXPERT_DATA) > 1 - ) - for param in model.parameters(): - if sync_moe_param and getattr(param, "is_expert", False): - ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.EXPERT_DATA)) - else: - ranks = gpc.get_ranks_in_group(ParallelMode.WEIGHT_DATA) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.WEIGHT_DATA)) - - -def sync_model_param_within_tp(model): - r"""This function is changed from colossalai, which is ``sync_model_param``. - - We modified this function to make sure it only sync parameters within tensor parallelism - but they are not splitted by tensor parallelism. - This function is used to make sure parameters that are not splitted by tensor parallelism - are the same across each tensor parallelism. - For example, parameters like RMSNorm, LayerNorm... - Args: - model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. - """ - parallel_mode = ParallelMode.TENSOR - if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: - for param in model.parameters(): - if not is_model_parallel_parameter(param): - ranks = gpc.get_ranks_in_group(parallel_mode) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) + sync_moe_param = gpc.is_using_parallel_mode(ParallelMode.EXPERT_DATA) + sync_parallel_mode = ParallelMode.WEIGHT_DATA if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.DATA + for param in model.parameters(): + if sync_moe_param and getattr(param, "is_expert", False): + ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.EXPERT_DATA)) + else: + ranks = gpc.get_ranks_in_group(sync_parallel_mode) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(sync_parallel_mode)) def sync_model_replica_param_group(model): r"""This function is changed from colossalai, which is ``sync_model_param``. - We modified this function to make sure it only sync IS_REPLICA_ZERO_PARALLEL parameters in world size. + We modified this function to make sure it only sync IS_REPLICA_ZERO_PARALLEL parameters in tp or wp process group. This function is used to make sure parameters that are not splitted are the same across each rank. For example, parameters like RMSNorm, LayerNorm... @@ -89,10 +75,12 @@ def sync_model_replica_param_group(model): model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. 
""" - for param in model.parameters(): - if is_replica_zero_parallel_parameter(param): - ranks = gpc.get_ranks_in_group(ParallelMode.GLOBAL) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.GLOBAL)) + parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.TENSOR + if gpc.is_using_parallel_mode(parallel_mode): + for param in model.parameters(): + if is_replica_zero_parallel_parameter(param): + ranks = gpc.get_ranks_in_group(parallel_mode) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) def get_parallel_log_file_name(): diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index a30cfba0..51f49836 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -68,7 +68,9 @@ def train( ), f"pipeline parallel size: {gpc.get_world_size(ParallelMode.PIPELINE)} is not as expected {pp_size}" if interleaved: assert ( - gpc.is_using_pp() and hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks == num_chunks + gpc.is_using_parallel_mode(ParallelMode.PIPELINE) + and hasattr(gpc.config.model, "num_chunks") + and gpc.config.model.num_chunks == num_chunks ) assert gpc.config.parallel["pipeline"].get( "interleaved_overlap", False @@ -134,7 +136,7 @@ def train( SchedulerMetricHook( metric=metric, skip=( - gpc.is_using_pp() + gpc.is_using_parallel_mode(ParallelMode.PIPELINE) and hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1 and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) diff --git a/train.py b/train.py index b64d3011..b76c100d 100644 --- a/train.py +++ b/train.py @@ -79,7 +79,7 @@ def get_scheduler_hooks(metric, zero_optim) -> List[SchedulerHook]: SchedulerMetricHook( metric=metric, skip=( - gpc.is_using_pp() + gpc.is_using_parallel_mode(ParallelMode.PIPELINE) and hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1 and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) @@ -342,33 +342,6 @@ def main(args): initialize_distributed_env(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed) assert hasattr(gpc, "config") and gpc.config is not None - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_tp_group:{gpc.get_ranks_in_group(ParallelMode.TENSOR)}", - flush=True, - ) - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_wp_group:{gpc.get_ranks_in_group(ParallelMode.WEIGHT)}", - flush=True, - ) - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_dp_group:{gpc.get_ranks_in_group(ParallelMode.DATA)}", - flush=True, - ) - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_pp_group:{gpc.get_ranks_in_group(ParallelMode.PIPELINE)}", - flush=True, - ) - # print( - # f"ht debug rank:{gpc.get_global_rank()} ranks_in_wdp_group:{gpc.get_ranks_in_group(ParallelMode.WEIGHT_DATA)}", - # flush=True, - # ) - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_zero1_group:{gpc.get_ranks_in_group(ParallelMode.ZERO1)}", - flush=True, - ) - - assert False - # initialize monitor manager context with initialize_monitor_manager( job_name=gpc.config.JOB_NAME, alert_address=gpc.config.monitor.alert.feishu_alert_address From e0cafb07bc833991602e1ad8c926820341317a20 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 21 Dec 2023 15:38:22 +0800 Subject: [PATCH 088/153] fix(overlap_handler.py): fix hook error and param group split --- internlm/core/context/parallel_context.py | 9 +++------ 
internlm/model/overlap_handler.py | 2 +- internlm/train/utils.py | 3 ++- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 53416761..7e357234 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -275,15 +275,12 @@ def is_first_rank(self, parallel_mode: ParallelMode): def is_rank_for_log(self): """Returns a boolean value indicating whether the current device should print log.""" - # is_log_rank = ( - # self.is_first_rank(ParallelMode.DATA) - # and self.is_first_rank(ParallelMode.TENSOR) - # and self.is_last_rank(ParallelMode.PIPELINE) - # ) is_log_rank = ( - self.is_first_rank(ParallelMode.WEIGHT) + self.is_first_rank(ParallelMode.TENSOR) + and self.is_first_rank(ParallelMode.WEIGHT) and self.is_first_rank(ParallelMode.DATA) and self.is_first_rank(ParallelMode.WEIGHT_DATA) + and self.is_last_rank(ParallelMode.PIPELINE) ) return is_log_rank diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index c81b09d0..65473a6b 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -287,7 +287,7 @@ def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: dis def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disable=W0613 fstp_modules = self.index_to_fstp_modules[self.num_blocks - 1] - if module in fstp_modules: + for module in fstp_modules: self._all_gather_module_weight(module) _wait_handle(module) diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 54e75ccb..7ef0cb81 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -139,7 +139,8 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["optimizer_mode"] = ParallelMode.ZERO1 # param groups may contain empty groups, such as fp32 - param_groups.extend(new_groups.values()) + if len(new_groups["embed_head"]["params"]) > 0: + param_groups.extend(new_groups.values()) # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) From 7974a32632e1309d2dfafb644f6de781c93f2851 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 22 Dec 2023 19:13:16 +0800 Subject: [PATCH 089/153] fix(overlap_handler.py): fix clear weight error when activation ckpt is True --- internlm/model/overlap_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 65473a6b..a5649eae 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -293,7 +293,7 @@ def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disab def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 _clear_handle(module) - if not self.model_checkpoint: + if not (self.model_checkpoint and self.is_forward is False): _clear_weight(module) def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 From 3361350348801318cac32b7c81fdabe0347792a2 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 25 Dec 2023 12:02:23 +0800 Subject: [PATCH 090/153] fix(parallel_context.py): fix seed mode when TENSOR parallel --- internlm/core/context/parallel_context.py | 6 ++++-- 1 
file changed, 4 insertions(+), 2 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 7e357234..6e7efaae 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -599,8 +599,10 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform # additional random operations during the RowParallelLinear module building process. # set_mode(ParallelMode.DUMMY) - set_mode(ParallelMode.TENSOR) - set_mode(ParallelMode.WEIGHT) + if self.is_using_parallel_mode(ParallelMode.TENSOR): + set_mode(ParallelMode.TENSOR) + if self.is_using_parallel_mode(ParallelMode.WEIGHT): + set_mode(ParallelMode.WEIGHT) seeds = get_seeds() seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()]) From 9b22258a9a0eecbec334caf16326c8c556285cb9 Mon Sep 17 00:00:00 2001 From: "chenxun.p" <759046501@qq.com> Date: Tue, 26 Dec 2023 10:21:29 +0800 Subject: [PATCH 091/153] feat(*) refactor fstp handler --- internlm/core/communication/isp.py | 501 ++++++++++++++++++ internlm/model/linear.py | 32 +- internlm/model/overlap_handler.py | 391 -------------- internlm/model/utils.py | 453 ++++++---------- .../solver/optimizer/hybrid_zero_optim.py | 26 +- internlm/train/training_internlm.py | 32 +- train.py | 19 +- 7 files changed, 744 insertions(+), 710 deletions(-) create mode 100644 internlm/core/communication/isp.py delete mode 100644 internlm/model/overlap_handler.py diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py new file mode 100644 index 00000000..24e8201b --- /dev/null +++ b/internlm/core/communication/isp.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +from typing import Dict, List, Union +from functools import partial +from dataclasses import dataclass + +import torch +from torch import nn +from torch import distributed as dist + +from internlm.core.context import ParallelMode +from internlm.core.context import global_context as gpc +from internlm.core.naive_amp import NaiveAMPModel +from internlm.core.scheduler import SchedulerHook +from internlm.model.embedding import Embedding1D +from internlm.model.linear import ISPLinear, ScaleColumnParallelLinear +from internlm.model.utils import all_gather_raw, reduce_scatter_raw + + +@dataclass +class ISPCommModelConfig: + """ + model config for isp communicator. + """ + hidden_size: int = 0 + mlp_ratio: float = 0 + dtype: torch.dtype = torch.half + device: torch.device = torch.device("cuda") + modules: List[str] = None + + +class MemoryPool: + """ + memory pool for isp communication. + """ + + def __init__( + self, + model_conf: ISPCommModelConfig, + with_bias: bool = False, + ) -> None: + self._hidden_size = model_conf.hidden_size + self._mlp_ratio = model_conf.mlp_ratio + self._dtype = model_conf.dtype + self._device = model_conf.device + self._module_shapes = self._init_module_shape(model_conf.modules) + + # due to intern sequence parallel communication overlap, we need + # **two** memory pools for current block weights and the next block weights. + self.__all_gather_pool_len = 2 + # memory pool for weight all gather communications. 
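The two-entry pool below is classic double buffering: buffers are indexed by block_index % 2, so prefetching block i+1's weights can never overwrite block i's weights while they are still in use. The idea in isolation (a sketch, not the pool's real interface):

import torch

class DoubleBuffer:
    """Two pre-allocated buffers alternated by block index (illustrative sketch)."""

    def __init__(self, shape, dtype=torch.half, device="cuda"):
        self._buffers = [torch.empty(shape, dtype=dtype, device=device) for _ in range(2)]

    def get(self, block_index: int) -> torch.Tensor:
        # blocks i and i+1 always map to different buffers
        return self._buffers[block_index % 2]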
+        self._all_gather_weight_memory_pool = [
+            {
+                name: torch.zeros(shape, dtype=self._dtype, device=self._device).contiguous()
+                for name, shape in self._module_shapes.items()
+            }
+            for _ in range(self.__all_gather_pool_len)
+        ]
+        # memory pool for bias all gather communications.
+        if not with_bias:
+            self._all_gather_bias_memory_pool = None
+        else:
+            self._all_gather_bias_memory_pool = [
+                {
+                    name: torch.zeros(shape[0], dtype=self._dtype, device=self._device).contiguous()
+                    for name, shape in self._module_shapes.items()
+                }
+                for _ in range(self.__all_gather_pool_len)
+            ]
+
+        # memory pool for reduce scatter communications, allocated lazily.
+        self._reduce_scatter_memory_pool = {}
+        # memory pool for constant zero tensors, allocated lazily.
+        self._zero_const_pool = {}
+
+    def _init_module_shape(self, modules: List[str]) -> Dict[str, torch.Size]:
+        mlp_hidden_size = 256 * ((int(self._hidden_size * self._mlp_ratio) + 256 - 1) // 256)
+
+        # TODO: the memory pool should be more generic.
+        # Currently, it only supports llama-class models with specific naming structure.
+        static_shapes = {
+            "Wqkv": torch.Size((3 * self._hidden_size, self._hidden_size)),
+            "out_proj": torch.Size((self._hidden_size, self._hidden_size)),
+            "w1": torch.Size((mlp_hidden_size, self._hidden_size)),
+            "w2": torch.Size((mlp_hidden_size, self._hidden_size)),
+            "w3": torch.Size((self._hidden_size, mlp_hidden_size)),
+        }
+
+        return {name: static_shapes[name] for name in modules}
+
+    def allocate_constant_zero(self, size: tuple) -> torch.Tensor:
+        if size not in self._zero_const_pool:
+            self._zero_const_pool[size] = torch.zeros(*size, dtype=self._dtype, device=self._device).contiguous()
+
+        return self._zero_const_pool[size]
+
+    def allocate_all_gather_memory(self, block_index: int, module_name: str, is_bias: bool = False) -> torch.Tensor:
+        # TODO: should we trace the usage of each memory block to avoid reusing the
+        # same memory block, which may hide some potential bugs.
+        if not is_bias:
+            mem = self._all_gather_weight_memory_pool[block_index % 2][module_name]
+        else:
+            enable_bias = self._all_gather_bias_memory_pool is not None
+            assert enable_bias, "memory pool for bias is disabled."
+
+            mem = self._all_gather_bias_memory_pool[block_index % 2][module_name]
+
+        return mem
+
+    def allocate_reduce_scatter_memory(self, key: tuple) -> torch.Tensor:
+        # if key not in dict
+        if key not in self._reduce_scatter_memory_pool:
+            self._reduce_scatter_memory_pool[key] = []
+
+        for index, mem_item in enumerate(self._reduce_scatter_memory_pool[key]):
+            if mem_item.idle is True:
+                self._reduce_scatter_memory_pool[key][index].idle = False
+                return self._reduce_scatter_memory_pool[key][index]
+
+        # if the memory pool is all used
+        new_item = torch.zeros(
+            key,
+            dtype=self._dtype,
+            device=self._device,
+        ).contiguous()
+        setattr(new_item, "idle", False)
+        setattr(new_item, "index", len(self._reduce_scatter_memory_pool[key]))
+        self._reduce_scatter_memory_pool[key].append(new_item)
+
+        return new_item
+
+    def free_reduce_scatter_memory(self, key, index):
+        self._reduce_scatter_memory_pool[key][index].idle = True
+
+    def reset_lazy_pools(self) -> None:
+        # Should the memory pool re-allocate all-gather memory for every iteration?
+        # Currently, it just clears the memory pool for reduce scatter communication.
+        self._zero_const_pool = {}
+        self._reduce_scatter_memory_pool = {}
+
+
+class ISPCommunicator:
+    """
+    ISP Communicator for managing the all-gather and reduce_scatter of Intern Sequence Parallel.
+ """ + + def __init__( + self, + model: Union[nn.Module, nn.ModuleList], + model_conf: ISPCommModelConfig, + overlap: bool = False, + activation_checkpointing: bool = False, + enable_memory_pool: bool = False, + process_group: dist.ProcessGroup = None, + ) -> None: + self.process_group = process_group + self.model_checkpoint = activation_checkpointing + self.overlap = overlap + self.enable_memory_pool = overlap and enable_memory_pool + self.model_conf = model_conf + self.is_forward = True + + self._isp_outs = [] + self._isp_modules = [] + self._module_name = model_conf.modules.copy() + + # key: isp module; value: module global all-gather op handle + self._weight_global_handle = {} + # key: isp module; value: module bias global all-gather op handle + self._bias_global_handle = {} + self.reduce_scatter_handlers = {} + # key: isp module; value: module global weight after all-gather op + self._weight_global_output = {} + # key: isp module; value: module bias global weight after all-gather op + self._bias_global_output = {} + # key: isp module; value: transformer block index + self._module_to_index = {} + # key: transformer block index; value: isp modules + self._index_to_isp_module = {} + self._last_block = None + self._head = [] + self._embedding = [] + + # just want to share same for loop for ModuleList and Module + model = model if isinstance(model, nn.ModuleList) else [model] + for chunk in model: + if isinstance(chunk, NaiveAMPModel): + chunk = chunk.model + self._parse_model_structure(chunk) + + self.num_blocks = len(self._index_to_isp_module) + + if self.enable_memory_pool: + self.memory_pool = MemoryPool(model_conf) + else: + self.memory_pool = None + + if self.overlap: + self._register_sync_parameters_hook() + + def _parse_model_structure(self, model: nn.Module) -> None: + # Important: only works for llama-class models + for chunk_name, children in model.named_children(): + if isinstance(children, ScaleColumnParallelLinear): + setattr(children, "isp_name", "head") + self._head.append(children) + elif isinstance(children, Embedding1D): + self._embedding.append(children) + elif isinstance(children, nn.ModuleList): + self._last_block = children[-1] + + for idx, block in enumerate(children): + self._index_to_isp_module[idx] = [] + for sub_name, sub in block.named_children(): + for name, child in sub.named_children(): + if name == "out_proj": + self._isp_outs.append(child) + self._module_to_index[child] = idx + if isinstance(child, ISPLinear): + self._module_to_index[child] = idx + self._isp_modules.append(child) + self._index_to_isp_module[idx].append(child) + + setattr(child, "isp_name", name) + + full_name = f"{chunk_name}.{idx}.{sub_name}.{name}" + setattr( + child.weight, + "isp_reduce_scatter_name", + f"{full_name}.weight", + ) + if child.bias is not None: + setattr( + child.bias, + "isp_reduce_scatter_name", + f"{full_name}.bias", + ) + + def _all_gather_module_weight(self, module): + with_bias = module.bias is not None + block_index = self._module_to_index[module] + + # prepare memory pool allocator for weight and bias. 
+ if self.enable_memory_pool: + weight_memory_pool_allocator = partial( + self.memory_pool.allocate_all_gather_memory, + block_index, + module.isp_name, + ) + else: + weight_memory_pool_allocator = None + + if self.enable_memory_pool and with_bias: + bias_memory_pool_allocator = partial( + self.memory_pool.allocate_all_gather_memory, + block_index, + module.isp_name, + is_bias=True, + ) + else: + bias_memory_pool_allocator = None + + # submit the all-gather communication for weight and bias. + if with_bias: + bias_output, bias_handle = all_gather_raw( + module.bias, + self.process_group, + async_op=True, + memory_pool_allocator=bias_memory_pool_allocator, + ) + self._bias_global_handle[module] = bias_handle + self._bias_global_output[module] = bias_output + + weight_output, weight_handle = all_gather_raw( + module.weight, + self.process_group, + async_op=True, + memory_pool_allocator=weight_memory_pool_allocator, + ) + self._weight_global_handle[module] = weight_handle + self._weight_global_output[module] = weight_output + + def _all_gather_block_weight(self, block_index: int): + for module in self._index_to_isp_module[block_index]: + self._all_gather_module_weight(module) + + def _wait_handle(self, module): + handle = self._weight_global_handle[module] + handle.wait() + if module.bias is not None: + bias_handle = self._bias_global_handle[module] + bias_handle.wait() + + def _clear_handle(self, module): + if module in self._weight_global_handle: + del self._weight_global_handle[module] + if module in self._bias_global_handle: + del self._bias_global_handle[module] + + def _clear_weight(self, module): + if module in self._weight_global_output: + del self._weight_global_output[module] + if module in self._bias_global_output: + del self._bias_global_output[module] + + def _post_forward_hook_for_embedding(self, *args): # pylint: disable=W0613 + """ + prefetch weight for block 0 after embedding forward. 
+ """ + self._all_gather_block_weight(0) + + def _pre_forward_hook_for_out_proj(self, module: nn.Module, *args): # pylint: disable=W0613 + block_index = self._module_to_index[module] + + if self.model_checkpoint and self.is_forward is False: + if block_index - 1 >= 0: + self._all_gather_block_weight(block_index - 1) + else: + # start the all-gather for next block + if block_index + 1 < self.num_blocks: + self._all_gather_block_weight(block_index + 1) + + def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 + if module not in self._weight_global_handle: + self._all_gather_module_weight(module) + + self._wait_handle(module) + + def _pre_forward_hook_for_block(self, *args): # pylint: disable=W0613 + for module in self._index_to_isp_module[self.num_blocks - 1]: + self._all_gather_module_weight(module) + self._wait_handle(module) + + def _post_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 + self._clear_handle(module) + if not self.model_checkpoint: + self._clear_weight(module) + + def _post_backward_hook_for_head(self, *args): # pylint: disable=W0613 + self._all_gather_module_weight(self._isp_modules[-1]) + + def _pre_backward_hook_for_head(self, *args): # pylint: disable=W0613 + if self.is_forward is False: + self._all_gather_block_weight(self.num_blocks - 1) + + def _pre_backward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 + # wait handle for current module + if module not in self._weight_global_handle: + self._all_gather_module_weight(module) + + self._wait_handle(module) + + # start the all-gather for next module + module_index = self._isp_modules.index(module) + if module_index - 1 >= 0: + next_module = self._isp_modules[module_index - 1] + self._all_gather_module_weight(next_module) + + def _post_backward_hook_for_module(self, module, *args): # pylint: disable=W0613 + self._clear_handle(module) + self._clear_weight(module) + + def _register_sync_parameters_hook(self) -> None: + """ + register forward hooks and backward hooks for isp modules. + """ + # register forward hooks + # 1. register post_forward_hook @embedding module to prefetch for block 0 + # 2. register pre_forward_hook @out_proj module to prefetch for next block, + # notice that next block's all_gather op should be after current block's all_to_all op + # 3. register pre_forward_hook @isp_module to wait handle for current module + # 4. register post_forward_hook @isp_module to release resource + for embedding in self._embedding: + embedding.register_forward_hook(self._post_forward_hook_for_embedding) + + if self.model_checkpoint: + if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): + for head in self._head: + head.register_full_backward_pre_hook(self._pre_backward_hook_for_head) + else: + self._last_block.register_forward_pre_hook(self._pre_forward_hook_for_block) + + for out_proj in self._isp_outs: + out_proj.register_forward_pre_hook(self._pre_forward_hook_for_out_proj) + + for module in self._isp_modules: + module.register_forward_pre_hook(self._pre_forward_hook_for_module) + module.register_forward_hook(self._post_forward_hook_for_module) + + # register backward hooks + # 1. register post_backward_hook @head module to prefetch for the last block's last module + # 2. register pre_backward_hook @isp_module to wait handle for current module and to prefetch for next module + # 3. 
register post_backward_hook @isp_module to release resource + if not self.model_checkpoint: + for head in self._head: + head.register_full_backward_hook(self._post_backward_hook_for_head) + + for module in self._isp_modules: + module.register_full_backward_pre_hook(self._pre_backward_hook_for_module) + + for module in self._isp_modules: + module.register_full_backward_hook(self._post_backward_hook_for_module) + + def _get_constant_zero(self, size: tuple) -> torch.Tensor: + if self.enable_memory_pool: + return self.memory_pool.allocate_constant_zero(size) + else: + return torch.zeros( + *size, + dtype=self.model_conf.dtype, + device=self.model_conf.device, + ).contiguous() + + # communication operation interfaces + + def all_gather(self, tensor: torch.Tensor, module: nn.Module, is_bias: bool = False): + if dist.get_world_size(self.process_group) <= 1: + return tensor + + if not self.overlap: + result, _ = all_gather_raw(tensor, self.process_group, async_op=False) + elif is_bias: + result = self._bias_global_output[module] + else: + result = self._weight_global_output[module] + + return result + + def reduce_scatter( + self, + tensor: torch.Tensor, + model: nn.Module, + op: dist.ReduceOp, + is_bias: bool = False, + ): + if dist.get_world_size(self.process_group) <= 1: + return tensor + + if not self.overlap: + result, handle = reduce_scatter_raw(tensor, self.process_group, op=op, async_op=True) + else: + if is_bias: + assert hasattr(model.bias, "isp_reduce_scatter_name") + key = getattr(model.bias, "isp_reduce_scatter_name") + else: + assert hasattr(model.weight, "isp_reduce_scatter_name") + key = getattr(model.weight, "isp_reduce_scatter_name") + + self.reduce_scatter_handlers[key] = reduce_scatter_raw( + tensor, + self.process_group, + op=op, + async_op=True, + memory_pool_allocator=self.memory_pool.allocate_reduce_scatter_memory, + ) + + result, handle = ( + self._get_constant_zero( + ( + tensor.shape[0] // dist.get_world_size(self.process_group), + *tensor.shape[1:], + ) + ), + None, + ) + + return result, handle + + +class ISPCommunicatorSchedulerHook(SchedulerHook): + """ + SchedulerHook for isp overlap handler + """ + + def __init__(self, overlap_handler: ISPCommunicator, zero_optim) -> None: + self._isp_communicator = overlap_handler + self._zero_optim = zero_optim + + def before_forward(self, scheduler, inputs) -> None: + if self._isp_communicator.model_checkpoint: + self._isp_communicator.is_forward = True + + def after_forward(self, scheduler, outputs) -> None: + pass + + def before_criterion(self, scheduler, outputs, label) -> None: + pass + + def after_criterion(self, scheduler, loss) -> None: + pass + + def before_backward(self, scheduler, outputs, outputs_grad) -> None: + if self._isp_communicator.model_checkpoint: + self._isp_communicator.is_forward = False + + def after_backward(self, scheduler, inputs_grad) -> None: + self._zero_optim.accumulate_left_grads_after_backward() + + def post_helper_func(self, scheduler, outputs, label) -> None: + pass diff --git a/internlm/model/linear.py b/internlm/model/linear.py index fc5175d9..d475214f 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -12,7 +12,7 @@ from internlm.core.context import global_context as gpc from internlm.model.utils import ( Silu, - fstp_fused_dense_func, + isp_fused_dense_func, fused_dense_func_torch, megatron_fused_dense_func_torch, ) @@ -350,21 +350,29 @@ def __init__( ) -class FSTPLinear(ColumnParallelLinear): +class ISPLinear(ColumnParallelLinear): + # class level communicator 
variable.
+    __communicator = None
+
+    @staticmethod
+    def register_communicator(communicator):
+        ISPLinear.__communicator = communicator
+
     def forward(self, x):
-        return fstp_fused_dense_func(
+        assert self.__communicator is not None, "ISPLinear should be registered with a communicator first."
+
+        return isp_fused_dense_func(
             x,
             self.weight,
             self.bias,
-            process_group=self.process_group,
             module=self,
-            handler=gpc.fstp_handler,
+            communicator=self.__communicator,
         )
 
 
-class FSTPFeedForward(BaseFeedForward):
+class ISPFeedForward(BaseFeedForward):
     """
-    FeedForward in FSTP.
+    FeedForward in ISP.
 
     Args:
         in_features (int): size of each input sample
@@ -398,8 +406,8 @@ def __init__(
             device,
             dtype,
             multiple_of,
-            FSTPLinear,
-            FSTPLinear,
+            ISPLinear,
+            ISPLinear,
         )
 
 
@@ -409,7 +417,7 @@ def get_mlp_cls(tp_mode: str):
     elif tp_mode == "msp":
         mlp_cls = MegatronFeedForward
     else:
-        mlp_cls = FSTPFeedForward
+        mlp_cls = ISPFeedForward
     return mlp_cls
 
 
@@ -420,12 +428,12 @@ def get_linear_cls(tp_mode: str, parallel_mode: str):
         elif tp_mode == "msp":
             cls = MegatronColumnParallelLinearTorch
         else:
-            cls = FSTPLinear
+            cls = ISPLinear
     elif parallel_mode == "row":
         if tp_mode in ["mtp", "fsp"]:
             cls = RowParallelLinearTorch
         elif tp_mode == "msp":
             cls = MegatronRowParallelLinearTorch
         else:
-            cls = FSTPLinear
+            cls = ISPLinear
     return cls
diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py
deleted file mode 100644
index c81b09d0..00000000
--- a/internlm/model/overlap_handler.py
+++ /dev/null
@@ -1,391 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-from typing import Any, Union
-
-import torch
-from torch import nn
-
-from internlm.core.context import ParallelMode
-from internlm.core.context import global_context as gpc
-from internlm.core.naive_amp import NaiveAMPModel
-from internlm.core.scheduler import SchedulerHook
-from internlm.model.embedding import Embedding1D
-from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear
-from internlm.model.utils import (
-    all_gather_raw,
-    all_gather_raw_bias_memory_pool,
-    all_gather_raw_memory_pool,
-)
-from internlm.utils.common import get_current_device
-
-
-class FSTPOverlapHandler:
-    """
-    FSTP overlap handler for managing the all-gather and reduce_scatter overlapping.
- """ - - def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - self.process_group = process_group - self.fstp_outs = [] - self.fstp_modules = [] - self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.weight_global_handle = dict() # key: fstp module; value: module global all-gather op handle - self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle - self.weight_global_output = dict() # key: fstp module; value: module global weight after all-gather op - self.bias_global_output = dict() # key: fstp module; value: module bias global weight after all-gather op - self.module_to_index = dict() # key: fstp module; value: transformer block index - self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules - self.last_block = None - self.head = [] - self.embedding = [] - self.model_checkpoint = gpc.config.model.checkpoint - self.enable_memory_pool = gpc.config.parallel["weight"].get("memory_pool", False) - self.is_forward = True - - self.reduce_scatter_handlers = {} - self.zero_const_pool = {} - - # just want to share same for loop for ModuleList and Module - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _chunk_name, children in _chunk.named_children(): - if isinstance(children, ScaleColumnParallelLinear): - setattr(children, "_fstp_name", "head") - self.head.append(children) - elif isinstance(children, Embedding1D): - self.embedding.append(children) - elif isinstance(children, nn.ModuleList): - self.last_block = children[len(children) - 1] - for idx, block in enumerate(children): - self.index_to_fstp_modules[idx] = [] - for _sub_name, sub in block.named_children(): - for name, child in sub.named_children(): - if name == "out_proj": - self.fstp_outs.append(child) - self.module_to_index[child] = idx - if isinstance(child, FSTPLinear): - self.module_to_index[child] = idx - self.fstp_modules.append(child) - self.index_to_fstp_modules[idx].append(child) - - setattr(child, "_fstp_name", name) - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - - self.num_blocks = len(self.index_to_fstp_modules) - - if self.enable_memory_pool: - self._initialize_memory_pool() - self._register_sync_parameters_hook() - - def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: - if self.enable_memory_pool: - if size not in self.zero_const_pool: - self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() - - return self.zero_const_pool[size] - else: - return torch.zeros(*size, dtype=dtype, device=device).contiguous() - - def set_forward_mode(self, flag): - self.is_forward = flag - - def _initialize_module_shape(self): - hidden_size = gpc.config.HIDDEN_SIZE - mlp_ratio = gpc.config.MLP_RATIO - mlp_hidden_size = int(hidden_size * mlp_ratio) - mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) - - self.module_shape["Wqkv"] = (3 * hidden_size, hidden_size) - self.module_shape["out_proj"] = (hidden_size, hidden_size) - self.module_shape["w1"] = (mlp_hidden_size, hidden_size) - self.module_shape["w2"] = (mlp_hidden_size, hidden_size) - self.module_shape["w3"] = (hidden_size, mlp_hidden_size) - - def _initialize_memory_pool(self) -> None: - # allocate memory pool - 
self.all_gather_memory_pool = [] - self.all_gather_bias_memory_pool = [] - self.reduce_scatter_memory_pool = {} - self.module_shape = {} - - self._initialize_module_shape() - dtype = gpc.config.model.get("dtype", torch.half) - device = get_current_device() - - for _ in range(2): - weight = {} - for name in self.module_name: - weight[name] = torch.zeros(self.module_shape[name], dtype=dtype, device=device).contiguous() - self.all_gather_memory_pool.append(weight) # containing two groups of block weight - - def clear_memory_pool(self) -> None: - assert self.enable_memory_pool - - self.zero_const_pool = {} - self.reduce_scatter_memory_pool = {} - - def _get_weight_from_memory_pool(self, module): - assert self.enable_memory_pool - - block_index = self.module_to_index[module] - return self.all_gather_memory_pool[block_index % 2][module._fstp_name] - - def _get_bias_from_memory_pool(self, module: nn.Module): - assert self.enable_memory_pool - - block_index = self.module_to_index[module] - # if the bias memory pool is empty or module has been not allocated memory - if len(self.all_gather_bias_memory_pool) == 0: - for _ in range(2): - weight = {} - weight[module._fstp_name] = torch.zeros( - self.module_shape[module._fstp_name][0], - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - self.all_gather_bias_memory_pool.append(weight) - elif module._fstp_name not in self.all_gather_bias_memory_pool[0]: - for i in range(2): - self.all_gather_bias_memory_pool[i][module._fstp_name] = torch.zeros( - self.module_shape[module._fstp_name][0], - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - - return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] - - def get_weight_all_gather(self, module): - if self.enable_memory_pool: - return self._get_weight_from_memory_pool(module) - else: - return self.weight_global_output[module] - - def get_bias_all_gather(self, module): - if self.enable_memory_pool: - return self._get_bias_from_memory_pool(module) - else: - return self.bias_global_output[module] - - def get_reduce_scatter_memory(self, key): - assert self.enable_memory_pool - - # if key not in dict - if key not in self.reduce_scatter_memory_pool: - self.reduce_scatter_memory_pool[key] = [] - - for index, mem_item in enumerate(self.reduce_scatter_memory_pool[key]): - if mem_item.idle is True: - self.reduce_scatter_memory_pool[key][index].idle = False - return self.reduce_scatter_memory_pool[key][index] - - # if the memory pool is all used - cur_len = len(self.reduce_scatter_memory_pool[key]) - self.reduce_scatter_memory_pool[key].append( - torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() - ) - setattr(self.reduce_scatter_memory_pool[key][cur_len], "idle", False) - setattr(self.reduce_scatter_memory_pool[key][cur_len], "index", cur_len) - return self.reduce_scatter_memory_pool[key][cur_len] - - def release_reduce_scatter_memory(self, key, index): - assert self.enable_memory_pool - self.reduce_scatter_memory_pool[key][index].idle = True - - def _all_gather_module_weight(self, module): - if self.enable_memory_pool: - if module.bias is not None: - bias_handle = all_gather_raw_bias_memory_pool( - module.bias, - self.process_group, - async_op=True, - module=module, - ) - self.bias_global_handle[module] = bias_handle - - weight_handle = all_gather_raw_memory_pool( - module.weight, - self.process_group, - async_op=True, - module=module, - ) - 
self.weight_global_handle[module] = weight_handle - else: - if module.bias is not None: - bias_output, bias_handle = all_gather_raw( - module.bias, - self.process_group, - async_op=True, - ) - self.bias_global_handle[module] = bias_handle - self.bias_global_output[module] = bias_output - - weight_output, weight_handle = all_gather_raw( - module.weight, - self.process_group, - async_op=True, - ) - self.weight_global_handle[module] = weight_handle - self.weight_global_output[module] = weight_output - - def _all_gather_block_weight(self, block_index: int): - fstp_modules = self.index_to_fstp_modules[block_index] - for module in fstp_modules: - self._all_gather_module_weight(module) - - def _register_sync_parameters_hook(self) -> None: - """ - register forward hooks and backward hooks for fstp modules. - """ - - def _wait_handle(module): - handle = self.weight_global_handle[module] - handle.wait() - if module.bias is not None: - bias_handle = self.bias_global_handle[module] - bias_handle.wait() - - def _clear_handle(module): - if module in self.weight_global_handle: - del self.weight_global_handle[module] - if module in self.bias_global_handle: - del self.bias_global_handle[module] - - def _clear_weight(module): - if module in self.weight_global_output: - del self.weight_global_output[module] - if module in self.bias_global_output: - del self.bias_global_output[module] - - def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 - self._all_gather_block_weight(0) - - def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # pylint: disable=W0613 - block_index = self.module_to_index[module] - if self.model_checkpoint and self.is_forward is False: - if block_index - 1 >= 0: - self._all_gather_block_weight(block_index - 1) - else: - # start the all-gather for next block - if block_index + 1 < self.num_blocks: - self._all_gather_block_weight(block_index + 1) - - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 - if module not in self.weight_global_handle: - self._all_gather_module_weight(module) - - _wait_handle(module) - - def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disable=W0613 - fstp_modules = self.index_to_fstp_modules[self.num_blocks - 1] - if module in fstp_modules: - self._all_gather_module_weight(module) - _wait_handle(module) - - def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 - _clear_handle(module) - if not self.model_checkpoint: - _clear_weight(module) - - def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 - self._all_gather_module_weight(self.fstp_modules[-1]) - - def _pre_backward_hook_for_head(module: nn.Module, grad_output): - if self.is_forward is False: - self._all_gather_block_weight(self.num_blocks - 1) - - def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 - # wait handle for current module - if module not in self.weight_global_handle: - self._all_gather_module_weight(module) - - _wait_handle(module) - - # start the all-gather for next module - module_index = self.fstp_modules.index(module) - if module_index - 1 >= 0: - next_module = self.fstp_modules[module_index - 1] - self._all_gather_module_weight(next_module) - - def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: disable=W0613 - _clear_handle(module) - _clear_weight(module) - - # register forward hooks - # 1. 
register post_forward_hook @embedding module to prefetch for block 0 - # 2. register pre_forward_hook @out_proj module to prefetch for next block, - # notice that next block's all_gather op should be after current block's all_to_all op - # 3. register pre_forward_hook @fstp_module to wait handle for current module - # 4. register post_forward_hook @fstp_module to release resource - for embedding in self.embedding: - embedding.register_forward_hook(_post_forward_hook_for_embedding) - - if self.model_checkpoint: - if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): - for head in self.head: - head.register_full_backward_pre_hook(_pre_backward_hook_for_head) - else: - self.last_block.register_forward_pre_hook(_pre_forward_hook_for_block) - - for out_proj in self.fstp_outs: - out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - - for module in self.fstp_modules: - module.register_forward_pre_hook(_pre_forward_hook_for_module) - module.register_forward_hook(_post_forward_hook_for_module) - - # register backward hooks - # 1. register post_backward_hook @head module to prefetch for the last block's last module - # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module - # 3. register post_backward_hook @fstp_module to release resource - if not self.model_checkpoint: - for head in self.head: - head.register_full_backward_hook(_post_backward_hook_for_head) - - for module in self.fstp_modules: - module.register_full_backward_pre_hook(_pre_backward_hook_for_module) - - for module in self.fstp_modules: - module.register_full_backward_hook(_post_backward_hook_for_module) - - -class FSTPOverlapSchedulerHook(SchedulerHook): - """ - SchedulerHook for fstp overlap handler - """ - - def __init__(self, overlap_handler: FSTPOverlapHandler, zero_optim) -> None: - self._overlap_handler = overlap_handler - self._zero_optim = zero_optim - - def before_forward(self, scheduler, inputs) -> None: - if self._overlap_handler.model_checkpoint: - self._overlap_handler.set_forward_mode(True) - - def after_forward(self, scheduler, outputs) -> None: - pass - - def before_criterion(self, scheduler, outputs, label) -> None: - pass - - def after_criterion(self, scheduler, loss) -> None: - pass - - def before_backward(self, scheduler, outputs, outputs_grad) -> None: - if self._overlap_handler.model_checkpoint: - self._overlap_handler.set_forward_mode(False) - - def after_backward(self, scheduler, inputs_grad) -> None: - self._zero_optim.accumulate_left_grads_after_backward() - - def post_helper_func(self, scheduler, outputs, label) -> None: - pass diff --git a/internlm/model/utils.py b/internlm/model/utils.py index a4fe3378..60e2cd99 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -1,17 +1,16 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional +from typing import Callable, Optional import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F from flash_attn.utils.distributed import all_reduce_raw -from torch import Tensor, nn +from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup -from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger @@ -114,93 +113,77 @@ def split_forward_gather_backward(input_, parallel_mode, dim): return _SplitForwardGatherBackward.apply(input_, parallel_mode, dim) -def all_gather_raw(input_: Tensor, 
process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0): - world_size = torch.distributed.get_world_size(process_group) - shape = list(input_.shape) - shape[gather_dim] = shape[gather_dim] * world_size - output = torch.empty(shape, dtype=input_.dtype, device=input_.device) - handle = torch.distributed.all_gather_into_tensor( - output, input_.contiguous(), group=process_group, async_op=async_op - ) - return output, handle - - -def all_gather_raw_memory_pool( +def all_gather_raw( input_: Tensor, process_group: ProcessGroup, async_op: bool = False, - module: nn.Module = None, + gather_dim: int = 0, + memory_pool_allocator: Callable = None, ): + if memory_pool_allocator is not None: + output = memory_pool_allocator() + else: + world_size = torch.distributed.get_world_size(process_group) + shape = list(input_.shape) + shape[gather_dim] = shape[gather_dim] * world_size + output = torch.empty(shape, dtype=input_.dtype, device=input_.device) + handle = torch.distributed.all_gather_into_tensor( - gpc.fstp_handler.get_weight_all_gather(module=module), - input_.contiguous(), - group=process_group, - async_op=async_op, + output, input_.contiguous(), group=process_group, async_op=async_op ) - return handle + return output, handle -def all_gather_raw_bias_memory_pool( +def reduce_scatter_raw( input_: Tensor, process_group: ProcessGroup, + op=torch.distributed.ReduceOp.SUM, async_op: bool = False, - module: nn.Module = None, -): - handle = torch.distributed.all_gather_into_tensor( - gpc.fstp_handler.get_bias_all_gather(module=module), - input_.contiguous(), - group=process_group, - async_op=async_op, - ) - return handle - - -def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): - assert my_input.dtype == grad_output.dtype - grad_weight = torch.matmul(grad_output.t(), my_input) - grad_bias = grad_output.sum(dim=0) if has_d_bias else None - return grad_weight, grad_bias - - -def reduce_scatter_raw( - input_: Tensor, process_group: ProcessGroup, op=torch.distributed.ReduceOp.SUM, async_op: bool = False + memory_pool_allocator: Callable = None, ): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 - output = torch.empty( - input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device - ).contiguous() - handle = torch.distributed.reduce_scatter_tensor( - output, input_.contiguous(), op=op, group=process_group, async_op=async_op - ) - return output, handle - -def reduce_scatter_raw_memory_pool( - input_: Tensor, process_group: ProcessGroup, op=torch.distributed.ReduceOp.SUM, async_op: bool = False -): - world_size = torch.distributed.get_world_size(process_group) - assert input_.shape[0] % world_size == 0 - if gpc.fstp_handler.enable_memory_pool: + if memory_pool_allocator is not None: size = (input_.shape[0] // world_size, *input_.shape[1:]) - output = gpc.fstp_handler.get_reduce_scatter_memory(size) + output = memory_pool_allocator(size) else: output = torch.empty( - input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device + input_.shape[0] // world_size, + *input_.shape[1:], + dtype=input_.dtype, + device=input_.device, ).contiguous() + handle = torch.distributed.reduce_scatter_tensor( output, input_.contiguous(), op=op, group=process_group, async_op=async_op ) return output, handle +def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): + assert my_input.dtype == grad_output.dtype + grad_weight = torch.matmul(grad_output.t(), my_input) + grad_bias = 
grad_output.sum(dim=0) if has_d_bias else None + return grad_weight, grad_bias + + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): "FusedDenseFunc for tensor parallel in flash-attn implementation." @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True, gather_dim=0): + def forward( + ctx, + x, + weight, + bias, + return_residual=False, + process_group=None, + sequence_parallel=True, + gather_dim=0, + ): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel with sequence parallelism: we do an all_gather_raw of x before doing the matmul. @@ -265,7 +248,11 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, + weight, + ) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -277,7 +264,9 @@ def backward(ctx, grad_output, *args): if process_group is not None and sequence_parallel: handle_x.wait() grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) else: grad_weight = None @@ -296,7 +285,16 @@ class MegatronFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True, gather_dim=0): + def forward( + ctx, + x, + weight, + bias, + return_residual=False, + process_group=None, + sequence_parallel=True, + gather_dim=0, + ): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel with sequence parallelism: we do an all_gather_raw of x before doing the matmul. 
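
The sequence-parallel contract that FusedDenseFunc and MegatronFusedDenseFunc share can be summarized without the autograd plumbing: gather the sequence-sharded input before the matmul, and reduce-scatter the input gradient back to shards in the backward pass. The sketch below is illustrative only, the function names are ours and not part of the patch, and it assumes the all_gather_raw/reduce_scatter_raw helpers defined above plus torch.nn.functional as F.

# Illustrative sketch (not part of this patch) of the sequence-parallel
# dataflow implemented by the fused dense autograd functions.
def sp_linear_forward(x_shard, weight, bias, process_group):
    # gather the sequence-sharded activations into the full tensor
    total_x, handle = all_gather_raw(x_shard, process_group, async_op=True)
    handle.wait()  # the real functions overlap this wait with other work
    return F.linear(total_x, weight, bias)


def sp_linear_input_grad(grad_output, weight, process_group):
    # d(loss)/d(total_x) = grad_output @ weight, then scatter-reduce the
    # result back to per-rank sequence shards
    grad_total_x = grad_output.matmul(weight)
    grad_x_shard, handle = reduce_scatter_raw(grad_total_x, process_group, async_op=True)
    handle.wait()
    return grad_x_shard
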
@@ -355,7 +353,11 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, + weight, + ) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -365,7 +367,9 @@ def backward(ctx, grad_output, *args): if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) else: grad_weight = None @@ -405,7 +409,11 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, + weight, + ) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -418,7 +426,9 @@ def backward(ctx, grad_output, *args): handle_x.wait() # we remove the cuda independence, which is different from flash_attn. grad_weight, grad_bias = linear_bias_wgrad_torch( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) else: grad_weight = None @@ -452,7 +462,11 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, + weight, + ) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -463,7 +477,9 @@ def backward(ctx, grad_output, *args): assert ctx.compute_weight_gradient # we remove the cuda independence, which is different from flash_attn. grad_weight, grad_bias = linear_bias_wgrad_torch( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) else: grad_weight = None @@ -473,8 +489,8 @@ def backward(ctx, grad_output, *args): return grad_input, grad_weight, grad_bias, None, None, None, None -class FSTPFusedDenseFunc(torch.autograd.Function): - "FusedDenseFunc for FSTP, which is optimized based on flash implementation." +class ISPFusedDenseFunc(torch.autograd.Function): + "FusedDenseFunc for ISP, which is optimized based on flash implementation." 
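
ISPFusedDenseFunc no longer touches process groups directly; all communication goes through the communicator's all_gather/reduce_scatter interface. A minimal stand-in satisfying that contract might look as follows (hypothetical, for illustration only; it assumes torch.distributed as dist and the raw helpers from internlm/model/utils.py):

# Minimal stand-in communicator (illustration, not part of the patch) showing
# the interface ISPFusedDenseFunc depends on: all_gather returns the full
# tensor, reduce_scatter returns (sharded_result, async_handle_or_None).
class NaiveISPCommunicator:
    def __init__(self, process_group):
        self.process_group = process_group

    def all_gather(self, tensor, module, is_bias=False):
        # blocking gather of the sharded weight or bias
        if dist.get_world_size(self.process_group) <= 1:
            return tensor
        output, _ = all_gather_raw(tensor, self.process_group, async_op=False)
        return output

    def reduce_scatter(self, tensor, module, op, is_bias=False):
        # asynchronous reduce-scatter; callers wait on the returned handle
        if dist.get_world_size(self.process_group) <= 1:
            return tensor, None
        return reduce_scatter_raw(tensor, self.process_group, op=op, async_op=True)
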
    @staticmethod
    @custom_fwd
@@ -483,247 +499,87 @@ def forward(
         x,
         weight,
         bias,
+        module,
         return_residual=False,
-        process_group=None,
-        module=None,
-        overlap_handler=None,
+        communicator=None,
+        use_flash_attn: bool = True,
     ):
         ctx.compute_weight_gradient = weight.requires_grad
         ctx.return_residual = return_residual
-        ctx.process_group = process_group
-        ctx.overlap_handler = overlap_handler
         ctx.module = module
+        ctx.communicator = communicator
+        ctx.use_flash_attn = use_flash_attn
 
         if torch.is_autocast_enabled():
             x = x.to(dtype=torch.get_autocast_gpu_dtype())
-        total_x = x.contiguous()
+        x = x.contiguous()
 
-        world_size = gpc.get_world_size(ParallelMode.WEIGHT)
-        if world_size > 1:
-            # do all_gather for weight and bias before actual computation
-            if overlap_handler is not None:
-                total_weight = gpc.fstp_handler.get_weight_all_gather(module=module)
-            else:
-                total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True)
-                handle_weight.wait()
-
-            if bias is not None:
-                if overlap_handler is not None:
-                    total_bias = gpc.fstp_handler.get_bias_all_gather(module=module)
-                else:
-                    total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True)
-                    handle_bias.wait()
-            else:
-                total_bias = bias
-        else:
-            total_weight = weight
-            total_bias = bias
+        total_weight = communicator.all_gather(weight, module)
+        total_bias = bias if bias is None else communicator.all_gather(bias, module, is_bias=True)
 
         if torch.is_autocast_enabled():
             total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype())
-            total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None
+            if total_bias is not None:
+                total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype())
 
         total_weight = total_weight.contiguous()
-        batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
+        batch_shape, n = x.shape[:-1], x.shape[-1]
         batch_dim = batch_shape.numel()
         # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
         if min(batch_dim, n, *total_weight.shape) > 65535 * 32:
             raise RuntimeError("fused_dense only supports matrix dims <= 2M")
-        output = F.linear(total_x, total_weight, total_bias)
+
+        output = F.linear(x, total_weight, total_bias)
+
+        # release memory
         del total_weight
         del total_bias
         if ctx.compute_weight_gradient:
-            ctx.save_for_backward(x, weight, bias)
+            ctx.save_for_backward(x, weight)
         else:
-            ctx.save_for_backward(weight, bias)
+            ctx.save_for_backward(weight)
         return output if not return_residual else (output, x)
 
     @staticmethod
     @custom_bwd
     def backward(ctx, grad_output, *args):
-        grad_output = grad_output.contiguous()
-        if ctx.return_residual:
-            (grad_input,) = args
-            grad_input = grad_input.contiguous()
-        process_group = ctx.process_group
-        overlap_handler = ctx.overlap_handler
         module = ctx.module
+        communicator = ctx.communicator
 
-        if ctx.compute_weight_gradient:
-            x, weight, bias = ctx.saved_tensors
-            total_x = x
-        else:
-            weight, bias = ctx.saved_tensors
-            total_x = None
-        batch_shape = grad_output.shape[:-1]
-        batch_dim = batch_shape.numel()
-        grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
-
-        world_size = gpc.get_world_size(ParallelMode.WEIGHT)
-        if world_size > 1:
-            if overlap_handler is not None:
-                total_weight = gpc.fstp_handler.get_weight_all_gather(module=module)
-            else:
-                total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True)
-                handle_weight.wait()
-        else:
-            total_weight = weight
-
-        # compute weight grad
-        if ctx.needs_input_grad[1]:
-            assert 
ctx.compute_weight_gradient - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] - ) - if world_size > 1: - if overlap_handler is not None: - grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( - grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - assert hasattr(weight, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( - handle_grad_weight, - grad_weight_async, - ) - grad_weight = overlap_handler.get_zero_by_shape( - ( - grad_weight.shape[0] // torch.distributed.get_world_size(process_group), - *grad_weight.shape[1:], - ), - dtype=grad_weight.dtype, - device=grad_weight.device, - ) - if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( - grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - assert hasattr(bias, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( - handle_grad_bias, - grad_bias_async, - ) - grad_bias = overlap_handler.get_zero_by_shape( - ( - grad_bias.shape[0] // torch.distributed.get_world_size(process_group), - *grad_bias.shape[1:], - ), - dtype=grad_bias.dtype, - device=grad_bias.device, - ) - else: - grad_weight, handle_grad_weight = reduce_scatter_raw( - grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw( - grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - else: - grad_weight = None - grad_bias = grad_output if ctx.needs_input_grad[2] else None + backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.use_flash_attn else linear_bias_wgrad_torch - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, total_weight.t()) - else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - else: - grad_input = None - del total_weight - - if ctx.needs_input_grad[1]: - if world_size > 1 and overlap_handler is None: - handle_grad_weight.wait() - if grad_bias is not None: - handle_grad_bias.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None, None, None - - -class FSTPFusedDenseFuncTorch(FSTPFusedDenseFunc): - "FusedDenseFunc for FSTP, which is optimized based on flash implementation." 
- - @staticmethod - @custom_bwd - def backward(ctx, grad_output, *args): grad_output = grad_output.contiguous() if ctx.return_residual: (grad_input,) = args grad_input = grad_input.contiguous() - process_group = ctx.process_group - overlap_handler = ctx.overlap_handler - module = ctx.module if ctx.compute_weight_gradient: - x, weight, bias = ctx.saved_tensors - total_x = x + x, weight = ctx.saved_tensors else: - weight, bias = ctx.saved_tensors - total_x = None + x, weight = (None, *ctx.saved_tensors) + batch_shape = grad_output.shape[:-1] batch_dim = batch_shape.numel() grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - world_size = gpc.get_world_size(ParallelMode.WEIGHT) - if world_size > 1: - if overlap_handler is not None: - total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) - else: - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() - else: - total_weight = weight + total_weight = communicator.all_gather(weight, module) # compute weight grad if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient - grad_weight, grad_bias = linear_bias_wgrad_torch( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + grad_weight, grad_bias = backward_func( + x.reshape(batch_dim, x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) - if world_size > 1: - if overlap_handler is not None: - grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( - grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - assert hasattr(weight, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( - handle_grad_weight, - grad_weight_async, - ) - grad_weight = overlap_handler.get_zero_by_shape( - ( - grad_weight.shape[0] // torch.distributed.get_world_size(process_group), - *grad_weight.shape[1:], - ), - dtype=grad_weight.dtype, - device=grad_weight.device, - ) - if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( - grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - assert hasattr(bias, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( - handle_grad_bias, - grad_bias_async, - ) - grad_bias = overlap_handler.get_zero_by_shape( - ( - grad_bias.shape[0] // torch.distributed.get_world_size(process_group), - *grad_bias.shape[1:], - ), - dtype=grad_bias.dtype, - device=grad_bias.device, - ) - else: - grad_weight, handle_grad_weight = reduce_scatter_raw( - grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw( - grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) + + grad_weight, grad_weight_sync = communicator.reduce_scatter( + grad_weight, module, op=torch.distributed.ReduceOp.AVG + ) + if grad_bias is not None: + grad_bias, grad_bias_sync = communicator.reduce_scatter( + grad_bias, module, op=torch.distributed.ReduceOp.AVG, is_bias=True + ) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -732,17 +588,23 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, total_weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, 
grad_input.shape[-1]),
+                    grad_output,
+                    total_weight,
+                )
             grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
         else:
             grad_input = None
+
         del total_weight
 
         if ctx.needs_input_grad[1]:
-            if world_size > 1 and overlap_handler is None:
-                handle_grad_weight.wait()
-                if grad_bias is not None:
-                    handle_grad_bias.wait()
+            if grad_weight_sync:
+                grad_weight_sync.wait()
+            if grad_bias is not None and grad_bias_sync is not None:
+                grad_bias_sync.wait()
+
         return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
 
 
@@ -759,9 +621,25 @@ def fused_dense_func_torch(
         x.dtype == torch.float32 and torch.is_autocast_enabled()
     )
     if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible:
-        return FusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim)
+        return FusedDenseFunc.apply(
+            x,
+            weight,
+            bias,
+            return_residual,
+            process_group,
+            sequence_parallel,
+            gather_dim,
+        )
     else:
-        return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim)
+        return FusedDenseFuncTorch.apply(
+            x,
+            weight,
+            bias,
+            return_residual,
+            process_group,
+            sequence_parallel,
+            gather_dim,
+        )
 
 
 def megatron_fused_dense_func_torch(
@@ -778,30 +656,49 @@ def megatron_fused_dense_func_torch(
     )
     if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible:
         return MegatronFusedDenseFunc.apply(
-            x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim
+            x,
+            weight,
+            bias,
+            return_residual,
+            process_group,
+            sequence_parallel,
+            gather_dim,
         )
     else:
         return MegatronFusedDenseFuncTorch.apply(
-            x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim
+            x,
+            weight,
+            bias,
+            return_residual,
+            process_group,
+            sequence_parallel,
+            gather_dim,
         )
 
 
-def fstp_fused_dense_func(
+def isp_fused_dense_func(
     x: Tensor,
     weight: Tensor,
     bias: Optional[Tensor] = None,
     return_residual: bool = False,
-    process_group=None,
     module=None,
-    handler=None,
+    communicator=None,
 ):
     dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or (
         x.dtype == torch.float32 and torch.is_autocast_enabled()
    )
    if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible:
-        return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler)
+        return ISPFusedDenseFunc.apply(x, weight, bias, module, return_residual, communicator)
    else:
-        return FSTPFusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, module, handler)
+        return ISPFusedDenseFunc.apply(
+            x,
+            weight,
+            bias,
+            module,
+            return_residual,
+            communicator,
+            use_flash_attn=False,
+        )
 
 
 def try_import_RMSNorm():
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index 681dfc9c..0f2cc1bf 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -58,6 +58,7 @@ def __init__(
         grad_scal_cfg: Config = None,
         zero_cfg: Config = None,
         param_bcast_sync_handler: ParamBcastSyncHandler = None,
+        isp_communicator = None,
     ):
         # DynamicGradScaler related args
         if gpc.config.model.dtype is torch.float32:
@@ -138,10 +139,7 @@ def __init__(
         if self._overlap_sync_param:
             assert self._param_bcast_sync_handler is not None
 
-        if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True:
-            self._fstp_handler = gpc.fstp_handler
-        else:
-            self._fstp_handler = None
+        self._isp_communicator = isp_communicator
 
        # 
iterate over the param group in the optimizer # partition these param groups for data parallel training @@ -362,9 +360,9 @@ def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 ): accum_grad_obj.register_hook(extra_layernorm_reduce_grad_hook) - # we should not only register for parameters which have _fstp_reduce_scatter_str attr. + # we should not only register for parameters which have isp_reduce_scatter_name attr. # we must keep up with reduce_grad_hook. - if self._fstp_handler is not None: + if self._isp_communicator is not None: accum_grad_obj.register_hook(accum_grad_hook) if self._overlap_sync_grad: @@ -373,7 +371,7 @@ def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 _define_and_attach(param, reduce_rank) def accumulate_left_grads_after_backward(self): - if self._fstp_handler is None: + if self._isp_communicator is None: return for group_id in range(self.num_param_groups): @@ -395,20 +393,22 @@ def belongs_to_current_rank(self, param) -> bool: def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None: for _param in bucket.get_param(reduce_rank): - if not hasattr(_param, "_fstp_reduce_scatter_str"): + if not hasattr(_param, "isp_reduce_scatter_name"): continue # wait and accumulate gardient. - _key = getattr(_param, "_fstp_reduce_scatter_str") - _comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[_key] + _key = getattr(_param, "isp_reduce_scatter_name") + _grad, _comm_handle = self._isp_communicator.reduce_scatter_handlers[_key] _comm_handle.wait() _param.grad.add_(_grad) # release cuda memory. - if self._fstp_handler.enable_memory_pool: - self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) + if self._isp_communicator.enable_memory_pool: + self._isp_communicator.memory_pool.free_reduce_scatter_memory( + key=tuple(_grad.size()), index=_grad.index + ) _grad = None - self._fstp_handler.reduce_scatter_handlers[_key] = None + self._isp_communicator.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 587c0035..2822da5a 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -44,7 +44,8 @@ RowParallelLinear, ) from internlm.model.multi_head_attention import MHA -from internlm.model.overlap_handler import FSTPOverlapHandler +from internlm.model.linear import ISPLinear +from internlm.core.communication.isp import ISPCommunicator, ISPCommModelConfig from internlm.model.utils import try_import_RMSNorm from internlm.monitor import send_heartbeat, set_env_var from internlm.monitor.monitor import monitor_manager as mm @@ -53,7 +54,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile +from internlm.utils.common import DummyProfile, get_current_device from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( @@ -175,11 +176,27 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - gpc.fstp_handler = None - if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: - gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.WEIGHT)) + if 
gpc.config.parallel.tensor.mode != "isp": + isp_communicator = None + else: + isp_communicator = ISPCommunicator( + model, + ISPCommModelConfig( + gpc.config.model.hidden_size, + gpc.config.model.mlp_ratio, + gpc.config.model.dtype, + get_current_device(), + ["Wqkv", "out_proj", "w1", "w2", "w3"], + ), + gpc.config.parallel.weight.overlap, + gpc.config.model.checkpoint, + gpc.config.parallel.weight.memory_pool, + gpc.get_group(ParallelMode.WEIGHT), + ) + # register communicator for isp linear. + ISPLinear.register_communicator(isp_communicator) - return model + return model, isp_communicator def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): @@ -216,7 +233,7 @@ def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): @llm_timeout(func_name="initialize_optimizer") -def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]): +def initialize_optimizer(model: Union[nn.Module, nn.ModuleList], isp_communicator: ISPCommunicator = None): """ Initialize optimizer. @@ -250,6 +267,7 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]): grad_scal_cfg=gpc.config.grad_scaler, zero_cfg=gpc.config.hybrid_zero_optimizer, param_bcast_sync_handler=param_bcast_sync_handler, + isp_communicator=isp_communicator, ) else: optimizer = FSDPadaptOptimizer( diff --git a/train.py b/train.py index b76c100d..5e048a32 100644 --- a/train.py +++ b/train.py @@ -19,7 +19,7 @@ from internlm.initialize import initialize_distributed_env from internlm.model.loss import FlashGPTLMLoss from internlm.model.metrics import AccPerplex, SchedulerMetricHook -from internlm.model.overlap_handler import FSTPOverlapSchedulerHook +from internlm.core.communication.isp import ISPCommunicatorSchedulerHook from internlm.monitor import initialize_monitor_manager, send_alert_message from internlm.monitor.monitor import monitor_manager as mm from internlm.train import ( @@ -71,7 +71,7 @@ def initialize_llm_logger(start_time: str): return uniscale_logger -def get_scheduler_hooks(metric, zero_optim) -> List[SchedulerHook]: +def get_scheduler_hooks(metric, zero_optim, isp_communicator) -> List[SchedulerHook]: scheduler_hooks: List[SchedulerHook] = [] if metric is not None: @@ -86,8 +86,9 @@ def get_scheduler_hooks(metric, zero_optim) -> List[SchedulerHook]: ), ), ) - if gpc.fstp_handler is not None: - scheduler_hooks.append(FSTPOverlapSchedulerHook(gpc.fstp_handler, zero_optim)) + + if isp_communicator is not None: + scheduler_hooks.append(ISPCommunicatorSchedulerHook(isp_communicator, zero_optim)) return scheduler_hooks @@ -133,7 +134,7 @@ def main(args): uniscale_logger = initialize_llm_logger(start_time=current_time) # initialize model - model = initialize_model() + model, isp_communicator = initialize_model() with open(args.config, "r") as f: config_lines = f.readlines() @@ -148,7 +149,7 @@ def main(args): # initialize and resume train state train_state = TrainState(gpc.config, train_dl.batch_sampler) - optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) + optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model, isp_communicator) ckpt_manager = CheckpointManager( ckpt_config=gpc.config.ckpt, @@ -194,7 +195,7 @@ def main(args): train_dataloader=train_dl, lr_scheduler=lr_scheduler, beta2_scheduler=beta2_scheduler, - scheduler_hooks=get_scheduler_hooks(metric, optimizer), + scheduler_hooks=get_scheduler_hooks(metric, optimizer, isp_communicator), ) # initialize simple memory profiler @@ -263,8 +264,8 @@ def main(args): ) timer("fwd-bwd").stop() - if gpc.fstp_handler is not None 
and gpc.fstp_handler.enable_memory_pool:
-            gpc.fstp_handler.clear_memory_pool()
+        if isp_communicator and isp_communicator.enable_memory_pool:
+            isp_communicator.memory_pool.reset_lazy_pools()
 
         # update parameters, and returns (success_update, grad_norm)
         trainer_result = trainer.step()
 
From fe6fed722a5f2282d839147ab46ecb15ad42c7cf Mon Sep 17 00:00:00 2001
From: "chenxun.p" <759046501@qq.com>
Date: Thu, 28 Dec 2023 11:11:24 +0800
Subject: [PATCH 092/153] feat(*): fix bug

---
 internlm/solver/optimizer/hybrid_zero_optim.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index 0f2cc1bf..414b402f 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -58,7 +58,7 @@ def __init__(
         grad_scal_cfg: Config = None,
         zero_cfg: Config = None,
         param_bcast_sync_handler: ParamBcastSyncHandler = None,
-        isp_communicator = None,
+        isp_communicator=None,
     ):
         # DynamicGradScaler related args
         if gpc.config.model.dtype is torch.float32:
@@ -362,7 +362,7 @@ def extra_layernorm_reduce_grad_hook(*args):  # pylint: disable=W0613
 
         # we should not only register for parameters which have isp_reduce_scatter_name attr.
         # we must keep up with reduce_grad_hook.
-        if self._isp_communicator is not None:
+        if self._isp_communicator and self._isp_communicator.overlap:
             accum_grad_obj.register_hook(accum_grad_hook)
 
         if self._overlap_sync_grad:
From a80fbe3ab7ae65c7c6c602da99a520a31c7d544d Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Fri, 12 Jan 2024 11:18:00 +0800
Subject: [PATCH 093/153] fix(train/utils.py): fix zp size check and embed_param group

---
 internlm/core/context/parallel_context.py     | 28 +++++++++++++++----
 .../core/scheduler/no_pipeline_scheduler.py   |  8 +++++-
 internlm/initialize/launch.py                 |  5 +++-
 internlm/model/overlap_handler.py             |  2 ++
 internlm/model/utils.py                       |  1 +
 internlm/train/utils.py                       |  6 ++--
 train.py                                      |  1 +
 7 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py
index 6e7efaae..56a7d715 100644
--- a/internlm/core/context/parallel_context.py
+++ b/internlm/core/context/parallel_context.py
@@ -410,17 +410,27 @@ def check_sanity(self):
             AssertionError: Raises an AssertionError if the world size does not equal to the product
                 of data parallel size, pipeline parallel size and tensor parallel size.
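+                For isp mode the analogous identity must hold for the weight parallel sizes:
+                world_size == weight data parallel size * pipeline parallel size * weight
+                parallel size. For example, 32 GPUs with pipeline size 1 and weight parallel
+                size 4 imply a weight data parallel size of 32 / (1 * 4) = 8.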
""" + # for mtp/msp/fsp dps = self.data_parallel_size pps = self.pipeline_parallel_size tps = self.tensor_parallel_size ws = self.world_size - # assert ws == dps * pps * tps, ( - # f"Expected the world size {ws} to be equal to data" - # f" parallel size ({dps}) * pipeline parallel size " - # f"({pps}) * tensor parallel size ({tps})" - # ) + assert ws == dps * pps * tps, ( + f"Expected the world size {ws} to be equal to data" + f" parallel size ({dps}) * pipeline parallel size " + f"({pps}) * tensor parallel size ({tps})" + ) + + # for isp + wps = self.weight_parallel_size + wdps = self.weight_data_parallel_size + assert ws == wdps * pps * wps, ( + f"Expected the world size {ws} to be equal to weight data" + f" parallel size ({wdps}) * pipeline parallel size " + f"({pps}) * weight parallel size ({wps})" + ) + assert self.zero1_parallel_size > 0 - assert self.data_parallel_size % self.zero1_parallel_size == 0 # check for fsdp: # if zo_size < dp_size, ckpts saving will introduce redundent storage for model weights @@ -470,10 +480,16 @@ def init_parallel_groups(self): assert ( self.zero1_parallel_size <= self.data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" + assert ( + self.data_parallel_size % self.zero1_parallel_size == 0 + ), f"data_parallel_size:{self.data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" else: assert ( self.zero1_parallel_size <= self.weight_data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than wdp_size:{self.weight_data_parallel_size}" + assert ( + self.weight_data_parallel_size % self.zero1_parallel_size == 0 + ), f"weight_data_parallel_size:{self.weight_data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 56661d8c..6e8454ff 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -89,6 +89,7 @@ def _train_one_batch( engine: Engine, forward_only: bool = False, return_loss: bool = True, + return_output: bool = False, scale_loss: int = 1, ): """Trains one batch of data. @@ -100,6 +101,7 @@ def _train_one_batch( forward_only (bool, optional): If True, the model is run for the forward pass, else back propagation will be executed. return_loss (bool, optional): Loss will be returned if True. + return_output (bool, optional): Output will be returned if True. scale_loss (int, optional): The scale factor for the loss. 
""" @@ -128,6 +130,10 @@ def _train_one_batch( loss /= scale_loss loss += moe_loss + # clear output before backward for releasing memory resource + if not return_output: + output = None + # backward if not forward_only: self._call_hooks("before_backward", None, None) @@ -192,7 +198,7 @@ def forward_backward_step( _data, _label = self._load_accum_batch(data, label) _output, _loss, _moe_loss = self._train_one_batch( - _data, _label, engine, forward_only, return_loss, self._grad_accum_size + _data, _label, engine, forward_only, return_loss, return_output_label, self._grad_accum_size ) if return_loss: diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index af4c9698..9d6ab323 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -450,7 +450,10 @@ def launch( ) print( - f"global_rank:{gpc.get_global_rank()} wp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT)} sp_rank:{gpc.get_local_rank(ParallelMode.SEQUENCE)} zo1_rank:{gpc.get_local_rank(ParallelMode.ZERO1)} dp_rank:{gpc.get_local_rank(ParallelMode.DATA)} weight_dp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}", + f"global_rank:{gpc.get_global_rank()} wp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT)} " + f"sp_rank:{gpc.get_local_rank(ParallelMode.SEQUENCE)} pp_rank:{gpc.get_local_rank(ParallelMode.PIPELINE)} " + f"zo1_rank:{gpc.get_local_rank(ParallelMode.ZERO1)} dp_rank:{gpc.get_local_rank(ParallelMode.DATA)} " + f"wdp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}", flush=True, ) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index a5649eae..2595d04e 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -386,6 +386,8 @@ def before_backward(self, scheduler, outputs, outputs_grad) -> None: def after_backward(self, scheduler, inputs_grad) -> None: self._zero_optim.accumulate_left_grads_after_backward() + if gpc.fstp_handler is not None and gpc.fstp_handler.enable_memory_pool: + gpc.fstp_handler.clear_memory_pool() def post_helper_func(self, scheduler, outputs, label) -> None: pass diff --git a/internlm/model/utils.py b/internlm/model/utils.py index a4fe3378..fdd457c2 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -34,6 +34,7 @@ def _split(input_, parallel_mode, dim=-1): tensor_list = torch.split(input_, dim_size // world_size, dim=dim) rank = gpc.get_local_rank(parallel_mode) output = tensor_list[rank].contiguous() + output = output.detach().clone() return output diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 7ef0cb81..58880bb8 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -138,9 +138,11 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["params"] = origin_params pgroup["optimizer_mode"] = ParallelMode.ZERO1 - # param groups may contain empty groups, such as fp32 - if len(new_groups["embed_head"]["params"]) > 0: + # param groups may contain empty groups, such as embed_head + if gpc.config.parallel.tensor.mode == "isp": param_groups.extend(new_groups.values()) + else: + assert len(new_groups["embed_head"]["params"]) <= 0 # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) diff --git a/train.py b/train.py index b76c100d..6eabd8f8 100644 --- a/train.py +++ b/train.py @@ -204,6 +204,7 @@ def main(args): optimizer.optim, 
log_folder=f"memory_trace/rank{gpc.get_global_rank()}_" + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" + + f"wp{gpc.get_local_rank(ParallelMode.WEIGHT)}_" + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}", ) else: From 1aebcd99a1793d486d9be3ef28c9cf8d7d0f5c16 Mon Sep 17 00:00:00 2001 From: "chenxun.p" <759046501@qq.com> Date: Fri, 12 Jan 2024 16:07:14 +0800 Subject: [PATCH 094/153] fix(model/util): force to pass communictor --- internlm/model/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 25e84804..9b82039b 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -501,8 +501,8 @@ def forward( weight, bias, module, + communicator, return_residual=False, - communicator=None, use_flash_attn: bool = True, ): ctx.compute_weight_gradient = weight.requires_grad @@ -680,10 +680,10 @@ def megatron_fused_dense_func_torch( def isp_fused_dense_func( x: Tensor, weight: Tensor, + module, + communicator, bias: Optional[Tensor] = None, return_residual: bool = False, - module=None, - communicator=None, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() @@ -708,7 +708,8 @@ def try_import_RMSNorm(): """ try: - from apex.normalization.fused_layer_norm import MixedFusedRMSNorm as RMSNorm + from apex.normalization.fused_layer_norm import \ + MixedFusedRMSNorm as RMSNorm return RMSNorm except ModuleNotFoundError: From 917ab0d214d8199d647e8cb725f2e014acf3af2c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 12 Jan 2024 17:57:55 +0800 Subject: [PATCH 095/153] fix(model/utils.py): fix param set --- internlm/model/utils.py | 7 +++---- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 9b82039b..1e6d76b0 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -689,15 +689,15 @@ def isp_fused_dense_func( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return ISPFusedDenseFunc.apply(x, weight, bias, module, return_residual, communicator) + return ISPFusedDenseFunc.apply(x, weight, bias, module, communicator, return_residual) else: return ISPFusedDenseFunc.apply( x, weight, bias, module, - return_residual, communicator, + return_residual, use_flash_attn=False, ) @@ -708,8 +708,7 @@ def try_import_RMSNorm(): """ try: - from apex.normalization.fused_layer_norm import \ - MixedFusedRMSNorm as RMSNorm + from apex.normalization.fused_layer_norm import MixedFusedRMSNorm as RMSNorm return RMSNorm except ModuleNotFoundError: diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 414b402f..3e3f5085 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -371,7 +371,7 @@ def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 _define_and_attach(param, reduce_rank) def accumulate_left_grads_after_backward(self): - if self._isp_communicator is None: + if self._isp_communicator is None or self._isp_communicator.overlap is False: return for group_id in range(self.num_param_groups): From b77787f445b64242fa4c8429450b5889ac9636b0 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 12 Jan 2024 18:52:47 +0800 Subject: [PATCH 096/153] 
fix(hybrid_zero_optim.py): fix reduce scatter error when wp_size=1 --- internlm/core/communication/isp.py | 3 ++- internlm/core/context/parallel_context.py | 7 ++++--- internlm/initialize/launch.py | 1 - internlm/model/linear.py | 2 +- internlm/model/multi_head_attention.py | 1 - internlm/solver/optimizer/hybrid_zero_optim.py | 6 +++++- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index cf19fbe5..eee38c5b 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -23,6 +23,7 @@ class ISPCommModelConfig: """ model config for isp communicator. """ + hidden_size: int = 0 mlp_ratio: float = 0 dtype: torch.dtype = torch.half @@ -435,7 +436,7 @@ def reduce_scatter( is_bias: bool = False, ): if dist.get_world_size(self.process_group) <= 1: - return tensor + return tensor, None if not self.overlap: result, handle = reduce_scatter_raw(tensor, self.process_group, op=op, async_op=True) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 56a7d715..826b51a1 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -487,9 +487,10 @@ def init_parallel_groups(self): assert ( self.zero1_parallel_size <= self.weight_data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than wdp_size:{self.weight_data_parallel_size}" - assert ( - self.weight_data_parallel_size % self.zero1_parallel_size == 0 - ), f"weight_data_parallel_size:{self.weight_data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" + assert self.weight_data_parallel_size % self.zero1_parallel_size == 0, ( + f"weight_data_parallel_size:{self.weight_data_parallel_size} % " + f"zero1_parallel_size: {self.zero1_parallel_size} != 0" + ) # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 9d6ab323..eedb0e65 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -8,7 +8,6 @@ from typing import Dict, Union import torch -from torch.distributed import get_rank from internlm.core.context import Config from internlm.core.context import global_context as gpc diff --git a/internlm/model/linear.py b/internlm/model/linear.py index d475214f..ed21a21b 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -364,9 +364,9 @@ def forward(self, x): return isp_fused_dense_func( x, self.weight, - self.bias, module=self, communicator=self.__communicator, + bias=self.bias, ) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index fb0309a5..eba5a6f1 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -35,7 +35,6 @@ from torch import Tensor, nn from torch.nn import Module -from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding from internlm.model.linear import get_linear_cls diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 3e3f5085..f7ce3bdc 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -362,7 +362,11 @@ def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 # we should not only register for 
parameters which have isp_reduce_scatter_name attr. # we must keep up with reduce_grad_hook. - if self._isp_communicator and self._isp_communicator.overlap: + if ( + self._isp_communicator + and self._isp_communicator.overlap + and gpc.config.parallel.weight.size > 1 + ): accum_grad_obj.register_hook(accum_grad_hook) if self._overlap_sync_grad: From 594d61db6c474a8e978074644b7000486da04f2d Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 15 Jan 2024 15:38:36 +0800 Subject: [PATCH 097/153] feat(model_checkpoint.py): model and optimizer save/load ckpt adapt to isp --- internlm/utils/model_checkpoint.py | 145 ++++++++++++++++++++--------- 1 file changed, 101 insertions(+), 44 deletions(-) diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 4b3f7d5b..fa9f50df 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -254,9 +254,13 @@ def save_model_checkpoint(folder, model): - folder - model_tp{tp_rank}_pp{pp_rank}.pt + If tensor parallel mode is isp, the saved weight is named: + - folder + - model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt + If fsdp is activated, the saved weight is named: - folder - - model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank} + - model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank}.pt If the tp is inconsistent with the saved one in the future use, the weight needs to be converted before loading. @@ -277,39 +281,54 @@ def save_model_checkpoint(folder, model): if folder is not None: dp_size = gpc.get_world_size(ParallelMode.DATA) tp_size = gpc.get_world_size(ParallelMode.TENSOR) + wp_size = gpc.get_world_size(ParallelMode.WEIGHT) dp_rank = gpc.get_local_rank(ParallelMode.DATA) tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) + wdp_rank = gpc.get_local_rank(ParallelMode.WEIGHT_DATA) # TODO In theory, we should also consider pp level, but since pp is generally a state across machines, # even if pp is not considered, it will definitely not be written on the same machine. 
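    # Illustrative shard-naming rule for the branch below (a sketch only; the
    # helper name is hypothetical): under isp each (tp, wp, pp) coordinate owns
    # a distinct weight shard, so the wp rank must appear in the file name:
    #     "model_tp{tp}_wp{wp}_pp{pp}.pt" if tp_mode == "isp"
    #     else "model_tp{tp}_pp{pp}.pt"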
- should_save_rank_pair = set() # (tp_rank, dp_rank) - for i in range(tp_size): - if gpc.config.parallel.zero1.fsdp: - for j in range(dp_size): - should_save_rank_pair.add((i, j)) - else: - should_save_rank_pair.add((i, i % dp_size)) - if (tp_rank, dp_rank) in should_save_rank_pair: - f_dp = f"_dp{dp_rank}" if gpc.config.parallel.zero1.fsdp else "" - fn = f"model_tp{tp_rank}_pp{pp_rank}{f_dp}.pt" + # for tensor parallel mode with isp + if gpc.config.parallel.tensor.mode == "isp": + if wdp_rank == 0 or dp_rank == 0: + fn = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" fp = os.path.join(folder, fn) llm_save(fp, saved_obj=states) - if not gpc.config.parallel.zero1.fsdp or dp_rank == tp_rank % dp_size: - topo_fn = f"topo_tp{tp_rank}_pp{pp_rank}.json" - topo_fp = os.path.join(folder, topo_fn) - llm_save(topo_fp, saved_obj=topo) + topo_fn = f"topo_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.json" + topo_fp = os.path.join(folder, topo_fn) + llm_save(topo_fp, saved_obj=topo) + else: + # for tensor parallel mode with mtp/msp/fsp + should_save_rank_pair = set() # (tp_rank, dp_rank) + for i in range(tp_size): + if gpc.config.parallel.zero1.fsdp: + for j in range(dp_size): + should_save_rank_pair.add((i, j)) + else: + should_save_rank_pair.add((i, i % dp_size)) + + if (tp_rank, dp_rank) in should_save_rank_pair: + f_dp = f"_dp{dp_rank}" if gpc.config.parallel.zero1.fsdp else "" + fn = f"model_tp{tp_rank}_pp{pp_rank}{f_dp}.pt" + fp = os.path.join(folder, fn) + llm_save(fp, saved_obj=states) + if not gpc.config.parallel.zero1.fsdp or dp_rank == tp_rank % dp_size: + topo_fn = f"topo_tp{tp_rank}_pp{pp_rank}.json" + topo_fp = os.path.join(folder, topo_fn) + llm_save(topo_fp, saved_obj=topo) # try to save expert parameter to separate files if model have moe layer - expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA) - expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA) - should_save_rank_pair.clear() - for i in range(tp_size): - should_save_rank_pair.add((i, i % expert_dp_size)) + # expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA) + # expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA) + # should_save_rank_pair.clear() + # for i in range(tp_size): + # should_save_rank_pair.add((i, i % expert_dp_size)) - if (tp_rank, expert_dp_rank) in should_save_rank_pair: - try_save_moe_checkpoint(folder, model, tp_rank, pp_rank) + # if (tp_rank, expert_dp_rank) in should_save_rank_pair: + # try_save_moe_checkpoint(folder, model, tp_rank, pp_rank) torch.distributed.barrier() @@ -328,9 +347,11 @@ def load_model_checkpoint(folder, model): """ tp_size = gpc.get_world_size(ParallelMode.TENSOR) + wp_size = gpc.get_world_size(ParallelMode.WEIGHT) pp_size = gpc.get_world_size(ParallelMode.PIPELINE) dp_size = gpc.get_world_size(ParallelMode.DATA) tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) dp_rank = gpc.get_local_rank(ParallelMode.DATA) @@ -342,11 +363,15 @@ def load_model_checkpoint(folder, model): "_dp" not in test_fn and not gpc.config.parallel.zero1.fsdp ), "FSDP model wants to load no-FSDP ckpts or reverse" - max_pp, max_tp, max_zo = 0, 0, 0 + max_pp, max_wp, max_tp, max_zo = 0, 0, 0, 0 for fn in fns: if fn.startswith("model_t") and not fn.endswith(".md5"): segements = os.path.splitext(fn)[0].split("_") - if gpc.config.parallel.zero1.fsdp: + if gpc.config.parallel.tensor.mode == "isp": + max_pp = max(max_pp, int(segements[-1][2:])) + max_wp = max(max_wp, int(segements[-2][2:])) + 
max_tp = max(max_tp, int(segements[-3][2:])) + elif gpc.config.parallel.zero1.fsdp: max_zo = max(max_zo, int(segements[-1][2:])) max_pp = max(max_pp, int(segements[-2][2:])) max_tp = max(max_tp, int(segements[-3][2:])) @@ -357,6 +382,9 @@ def load_model_checkpoint(folder, model): assert ( pp_size == max_pp + 1 ), f"The weights are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" + assert ( + wp_size == max_wp + 1 + ), f"The weights are save for {max_wp+1} parallelism, while current has {wp_size} weight parallelism" assert ( tp_size == max_tp + 1 ), f"The weights are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism" @@ -365,7 +393,9 @@ def load_model_checkpoint(folder, model): dp_size == max_zo + 1 ), f"The weights are save for {max_zo+1} FSDP shards , while current has {dp_size} FSDP shards" - if gpc.config.parallel.zero1.fsdp: + if gpc.config.parallel.tensor.mode == "isp": + should_load_name = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" + elif gpc.config.parallel.zero1.fsdp: should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_dp{dp_rank}.pt" else: should_load_name = f"model_tp{tp_rank}_pp{pp_rank}.pt" @@ -466,20 +496,26 @@ def save_optimizer_checkpoint(optim, state_path): # TODO sanity check for optimizer type zero_rank = gpc.get_local_rank(ParallelMode.ZERO1) tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) + dp_rank = gpc.get_local_rank(ParallelMode.DATA) zero_size = gpc.get_world_size(ParallelMode.ZERO1) tp_size = gpc.get_world_size(ParallelMode.TENSOR) - pp_size = gpc.get_world_size(ParallelMode.PIPELINE) - fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" + dp_size = gpc.get_world_size(ParallelMode.DATA) states = optim.state_dict() if isinstance(optim, HybridZeroOptimizer): - if gpc.get_global_rank() < zero_size * tp_size * pp_size: + if gpc.config.parallel.tensor.mode == "isp": + fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" llm_save(os.path.join(state_path, fp), states) - if "zero_devide_optim_plan" in states: - params_per_rank_id_dict = states.pop("zero_devide_optim_plan") - fp_meta = os.path.join(state_path, optim.rank_unique_id) - llm_save(fp_meta, params_per_rank_id_dict) + else: + fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" + if (gpc.get_global_rank() % (tp_size * dp_size)) < zero_size * tp_size: + llm_save(os.path.join(state_path, fp), states) + if "zero_devide_optim_plan" in states: + params_per_rank_id_dict = states.pop("zero_devide_optim_plan") + fp_meta = os.path.join(state_path, optim.rank_unique_id) + llm_save(fp_meta, params_per_rank_id_dict) else: llm_save(os.path.join(state_path, fp), states) @@ -516,32 +552,53 @@ def load_optimizer_checkpoint(folder, optim): """ fns = get_fns(folder) - max_tp, max_pp, max_zero = 0, 0, 0 + max_tp, max_wp, max_pp, max_zero, max_dp = 0, 0, 0, 0, 0 for fn in fns: if fn.startswith("optimizer_") and not fn.endswith(".md5"): - _, tp, pp, zero = os.path.splitext(fn)[0].split("_") - max_zero = max(max_zero, int(zero[2:])) - max_tp = max(max_tp, int(tp[2:])) - max_pp = max(max_pp, int(pp[2:])) + if gpc.config.parallel.tensor.mode == "isp": + _, tp, wp, pp, dp = os.path.splitext(fn)[0].split("_") + max_dp = max(max_dp, int(dp[2:])) + max_tp = max(max_tp, int(tp[2:])) + max_wp = max(max_wp, int(wp[2:])) + max_pp = max(max_pp, int(pp[2:])) + else: + _, tp, pp, zero = os.path.splitext(fn)[0].split("_") + max_zero = max(max_zero, int(zero[2:])) 
+ max_tp = max(max_tp, int(tp[2:])) + max_pp = max(max_pp, int(pp[2:])) zero_size = gpc.get_world_size(ParallelMode.ZERO1) - zero_rank = gpc.get_local_rank(ParallelMode.ZERO1) tp_size = gpc.get_world_size(ParallelMode.TENSOR) + wp_size = gpc.get_world_size(ParallelMode.WEIGHT) pp_size = gpc.get_world_size(ParallelMode.PIPELINE) + dp_size = gpc.get_world_size(ParallelMode.DATA) + assert ( + dp_size == max_dp + 1 + ), f"The optimizer states are save for {max_dp+1} data parallelism, while current has {dp_size} data parallelism" assert ( zero_size == max_zero + 1 - ), f"The weights are save for {max_zero+1} data parallel, while current has {zero_size} zero broadcast range." + ), f"The optimizer states are save for {max_zero+1} zero parallel, while current has {zero_size} zero broadcast range." assert ( pp_size == max_pp + 1 - ), f"The weights are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" + ), f"The optimizer states are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" assert ( tp_size == max_tp + 1 - ), f"The weights are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism" + ), f"The optimizer states are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism" + assert ( + wp_size == max_wp + 1 + ), f"The optimizer states are save for {max_wp+1} parallelism, while current has {wp_size} weight parallelism" + + zero_rank = gpc.get_local_rank(ParallelMode.ZERO1) + tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) + pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) + dp_rank = gpc.get_local_rank(ParallelMode.DATA) + if gpc.config.parallel.tensor.mode == "isp": + fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" + else: + fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" - fp = f"optimizer_tp{gpc.get_local_rank(ParallelMode.TENSOR)}_" - fp += f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}_" - fp += f"zo{zero_rank}.pt" states = llm_load(os.path.join(folder, fp), map_location=get_current_device()) if isinstance(optim, HybridZeroOptimizer): From e4d1ff89a101e5fc9386436af7bce5ecd2b4b6fc Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 16 Jan 2024 15:55:36 +0800 Subject: [PATCH 098/153] fix(model_checkpoint.py): fix dp/zo size check --- internlm/utils/model_checkpoint.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index fa9f50df..5c21af90 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -573,12 +573,14 @@ def load_optimizer_checkpoint(folder, optim): pp_size = gpc.get_world_size(ParallelMode.PIPELINE) dp_size = gpc.get_world_size(ParallelMode.DATA) - assert ( - dp_size == max_dp + 1 - ), f"The optimizer states are save for {max_dp+1} data parallelism, while current has {dp_size} data parallelism" - assert ( - zero_size == max_zero + 1 - ), f"The optimizer states are save for {max_zero+1} zero parallel, while current has {zero_size} zero broadcast range." 
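        # Sketch of the corrected check introduced below: isp shards optimizer
        # states across dp while the other tp modes shard across zero1, so only
        # the dimension that matches the current mode is validated:
        #     if tp_mode == "isp": assert dp_size == max_dp + 1
        #     else:                assert zero_size == max_zero + 1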
+ if gpc.config.parallel.tensor.mode == "isp": + assert ( + dp_size == max_dp + 1 + ), f"The optimizer states are save for {max_dp+1} data parallelism, while current has {dp_size} data parallelism" + if gpc.config.parallel.tensor.mode != "isp": + assert ( + zero_size == max_zero + 1 + ), f"The optimizer states are save for {max_zero+1} zero parallel, while current has {zero_size} zero broadcast range." assert ( pp_size == max_pp + 1 ), f"The optimizer states are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" From f2f88a773d5f95ee01a3234efc3c3c4ab9a3f4e4 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Wed, 27 Dec 2023 12:03:30 +0800 Subject: [PATCH 099/153] support sequence parallel for moe --- .../core/scheduler/no_pipeline_scheduler.py | 6 +++ internlm/core/scheduler/pipeline_scheduler.py | 11 ++-- internlm/model/modeling_moe.py | 52 ++++++++++++------- internlm/moe/sharded_moe.py | 4 +- 4 files changed, 50 insertions(+), 23 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 6e8454ff..2092548d 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -6,7 +6,9 @@ from typing import Any, Callable, Iterable, List, Optional import torch +import torch.distributed as dist +from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.engine import Engine from internlm.utils.common import conditional_context @@ -126,6 +128,10 @@ def _train_one_batch( if hasattr(gpc.config.model, "num_experts") else torch.tensor(0.0, device=torch.cuda.current_device(), dtype=gpc.config.model.get("dtype")) ) + # the moe_loss is computed among the "tensor" group if sequence parallel is enabled, + # so we need to do allreduce + if gpc.config.parallel.sequence_parallel: + dist.all_reduce(moe_loss, op=dist.ReduceOp.AVG, group=gpc.get_group(ParallelMode.TENSOR)) moe_loss /= scale_loss loss /= scale_loss loss += moe_loss diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py index 622c91f6..7d8fd3fd 100644 --- a/internlm/core/scheduler/pipeline_scheduler.py +++ b/internlm/core/scheduler/pipeline_scheduler.py @@ -133,10 +133,7 @@ def __init__( tensor_shape if tensor_shape is None or isinstance(tensor_shape, torch.Size) else torch.Size(tensor_shape) ) - self.scatter_gather_tensors = ( - scatter_gather_tensors - and gpc.is_using_parallel_mode(ParallelMode.TENSOR) - ) + self.scatter_gather_tensors = scatter_gather_tensors and gpc.is_using_parallel_mode(ParallelMode.TENSOR) if gpc.config.parallel.sequence_parallel: self.scatter_gather_tensors = False @@ -293,6 +290,9 @@ def _forward_step( if hasattr(gpc.config.model, "num_experts") else torch.tensor(0.0, device=torch.cuda.current_device(), dtype=gpc.config.model.get("dtype")) ) + # the moe_loss is computed among the "tensor" group if sequence parallel is enabled, so we need to do allreduce + if gpc.config.parallel.sequence_parallel: + dist.all_reduce(moe_loss, op=dist.ReduceOp.AVG, group=gpc.get_group(ParallelMode.TENSOR)) moe_loss /= self.num_microbatches accum_moe_loss.add_(moe_loss.detach()) @@ -840,6 +840,9 @@ def _forward_step(self, engine, chunk_id): if hasattr(gpc.config.model, "num_experts") else torch.tensor(0.0, device=torch.cuda.current_device(), dtype=gpc.config.model.get("dtype")) ) + # the moe_loss is computed among the "tensor" group if sequence parallel is enabled, so we need to do 
allreduce + if gpc.config.parallel.sequence_parallel: + dist.all_reduce(moe_loss, op=dist.ReduceOp.AVG, group=gpc.get_group(ParallelMode.TENSOR)) moe_loss /= self.num_microbatches if self._accum_moe_loss is not None: diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py index 9d9f3238..6c8f414d 100644 --- a/internlm/model/modeling_moe.py +++ b/internlm/model/modeling_moe.py @@ -15,13 +15,18 @@ from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - FeedForward, + MegatronScaleColumnParallelLinear, RewardModelLinear, ScaleColumnParallelLinear, + get_mlp_cls, ) from internlm.model.moe import MoE from internlm.model.multi_head_attention import MHA -from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm +from internlm.model.utils import ( + gather_forward_split_backward, + split_forward_gather_backward, + try_import_RMSNorm, +) from internlm.solver.pipeline_utils import partition_uniform from internlm.utils.checkpoint import activation_checkpoint from internlm.utils.common import filter_kwargs @@ -94,6 +99,7 @@ def __init__( moe_drop_tokens: bool = True, moe_use_rts: bool = True, moe_use_residual: bool = False, + tp_mode: str = "mtp", ): super().__init__() self.checkpoint = checkpoint @@ -103,10 +109,13 @@ def __init__( self.use_flash_attn = use_flash_attn head_dim = hidden_size // num_attention_heads + self.tp_mode = tp_mode + parallel_mode = ParallelMode.WEIGHT if self.tp_mode == "isp" else ParallelMode.TENSOR self.mixer = MHA( embed_dim=hidden_size, num_heads=num_attention_heads, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), + sequence_process_group=gpc.get_group(ParallelMode.TENSOR), dropout=attn_drop_rate, max_position_embeddings=max_position_embeddings, softmax_scale=1 / math.sqrt(head_dim), @@ -118,6 +127,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, + tp_mode=self.tp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -147,11 +157,12 @@ def __init__( ep_size = gpc.get_world_size(ParallelMode.EXPERT) if num_experts <= 1: # dense, not MoE if use_swiglu: - self.mlp = FeedForward( + mlp_cls = get_mlp_cls(self.tp_mode) + self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias=False, device=device, dtype=dtype, @@ -162,7 +173,7 @@ def __init__( int(hidden_size * mlp_ratio), out_features=hidden_size, activation="gelu_approx", - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias1=False, bias2=False, sequence_parallel=gpc.config.model.sequence_parallel, @@ -171,9 +182,6 @@ def __init__( device=device, dtype=dtype, ) - for _, param in self.mlp.named_parameters(): - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) else: # replace mlp by MoE module. The expert in MoE is a FeedForward module. 
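            # (note: the MoE gate built below stays replicated and is forced to
            #  fp32 via set_fp32_attr_to_module, while each expert is a plain
            #  FeedForward whose parallelism follows tp_mode)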
self.mlp = MoE( @@ -191,9 +199,6 @@ def __init__( device=device, dtype=dtype, ) - for _, param in self.mlp.moe_layer.experts.named_parameters(): - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) set_fp32_attr_to_module(self.mlp.moe_layer.gate) self.dropout2 = nn.Dropout(drop_rate) @@ -374,11 +379,16 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) + self.tp_mode = gpc.config.parallel.tensor.mode if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear + head_cls = ( + ScaleColumnParallelLinear + if self.tp_mode in ["mtp", "fsp", "isp"] + else MegatronScaleColumnParallelLinear + ) if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -395,8 +405,6 @@ def __init__( ) for _, param in self.embedding.named_parameters(): normal_(std=0.0052)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) self.embed_grad_scale = embed_grad_scale self.blocks = nn.ModuleList( [ @@ -428,6 +436,7 @@ def __init__( moe_drop_tokens=moe_drop_tokens, moe_use_rts=moe_use_rts, moe_use_residual=moe_use_residual, + tp_mode=self.tp_mode, ) for lid in range(num_layers) ] @@ -449,8 +458,7 @@ def __init__( ) for _, param in self.head.named_parameters(): normal_(std=0.0052)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) + self.parallel_output = parallel_output def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None): @@ -475,6 +483,10 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] + # if the sequence parallel mode is 'isp', the indexes should also be split in sequence dimension. 
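            # e.g. with sequence-parallel size 2 and a packed sequence of
            # length 8, hidden_states on rank0/rank1 cover tokens 0..3 / 4..7,
            # so indexes must be sliced the same way before rotary embedding
            # consumes them.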
+ if gpc.config.parallel.sequence_parallel and self.tp_mode == "isp": + indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None moe_losses = [] @@ -491,7 +503,11 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): - hidden_states = self.head(hidden_states) + # Evaluation + if hidden_states.ndim == 3: + hidden_states = self.head(hidden_states, gather_dim=1) + else: # Training + hidden_states = self.head(hidden_states, gather_dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/moe/sharded_moe.py b/internlm/moe/sharded_moe.py index dbee2a49..ac7613a2 100644 --- a/internlm/moe/sharded_moe.py +++ b/internlm/moe/sharded_moe.py @@ -12,6 +12,8 @@ from torch import Tensor from torch.nn import Module +from internlm.core.context import ParallelMode +from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer @@ -189,7 +191,7 @@ def top1gating( # if we don't want to drop any tokens if not drop_tokens: new_capacity = torch.max(exp_counts).to(logits.device) - dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group()) + dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.GLOBAL)) capacity = new_capacity # Compute l_aux From 6e012b148ee036471fd15b241bd1dd21e03b9efa Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Wed, 17 Jan 2024 11:24:02 +0800 Subject: [PATCH 100/153] modify expert groups --- .../core/context/process_group_initializer.py | 76 +++---------------- 1 file changed, 11 insertions(+), 65 deletions(-) diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index 5e59df22..dcf429d3 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -532,67 +532,6 @@ def init_dist_group(self, use_cpu: bool = False): return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode -class Initializer_Expert(ProcessGroupInitializer): - """A ProcessGroupInitializer for expert parallelism. - - Args: - rank (int): The rank of current process. - world_size (int): Size of whole communication world. - data_parallel_size (int): Size of data parallel. - pipeline_parallel_size (int): Size of pipeline parallel. - tensor_parallel_size (int): Size of tensor parallel. - zero1_parallel_size (int): Size of zero-1 parallel. - expert_parallel_size (int): Size of expert parallel. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.num_expert_parallel_group = self.world_size // self.expert_parallel_size - - assert self.world_size % self.num_expert_parallel_group == 0 - - # TODO: to match expert parallel with differnt data parallel size - assert self.data_parallel_size == self.expert_parallel_size - - def init_dist_group(self, use_cpu: bool = False): - """Initialize expert parallel groups, and assign local_ranks and groups to each gpu. 
- - Example: world_size = 8, model_parallel_size = 2, expert_parallel_size = 4 - model_parallel_group = [0,1], [2,3], [4,5], [6,7] - expert_parallel_group = [0,2,4,6], [1,3,5,7] - - Returns: - Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): - A expert parallelism's information tuple. - """ - local_rank = None - ranks_in_group = None - process_group = None - cpu_group = None - group_world_size = None - mode = ParallelMode.EXPERT - - for i in range(self.num_expert_parallel_group): - ranks = list(range(i, self.world_size, self.num_expert_parallel_group)) - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else group - ) - else: - group_cpu = None - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group - cpu_group = group_cpu - ranks_in_group = ranks - - return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode - - class Initializer_Expert_Data(ProcessGroupInitializer): """A ProcessGroupInitializer for expert data parallelism. @@ -608,7 +547,9 @@ class Initializer_Expert_Data(ProcessGroupInitializer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.num_expert_parallel_group = self.world_size // self.expert_parallel_size + + self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size + assert self.data_parallel_size % self.expert_parallel_size == 0 def _get_expert_parallel_ranks(self): """ @@ -620,9 +561,14 @@ def _get_expert_parallel_ranks(self): expert_data_parallel_group = [0,4], [2,6], [1,5], [3,7] """ data_parallel_groups = [] - model_parallel_size = self.pipeline_parallel_size * self.tensor_parallel_size - for i in range(model_parallel_size): - data_parallel_groups.append(list(range(i, self.world_size, model_parallel_size))) + for i in range(self.pipeline_parallel_size): + for j in range(self.sequence_parallel_size): + data_parallel_groups.append( + [ + i * self.ranks_num_per_pp + j + k * self.sequence_parallel_size + for k in range(self.data_parallel_size) + ] + ) expert_parallel_groups = [] expert_data_parallel_groups = [] From 18e6e78e16a62e140fd7b0179acff61c3f1f86c4 Mon Sep 17 00:00:00 2001 From: "chenxun.p" <759046501@qq.com> Date: Wed, 17 Jan 2024 16:15:34 +0800 Subject: [PATCH 101/153] feat(isp): support interleaved pipeline parallel scheduler --- internlm/core/communication/isp.py | 127 ++++++++++++++++++++--------- 1 file changed, 88 insertions(+), 39 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index eee38c5b..e048b623 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from functools import partial -from typing import Dict, List, Union +from typing import Any, Dict, List, Union import torch from torch import distributed as dist @@ -141,6 +141,22 @@ def reset_lazy_pools(self) -> None: self._reduce_scatter_memory_pool = {} +class ISPOverlapState: + def __init__(self) -> None: + self.num_blocks: int = 0 + self.embedding: List[nn.Module] = [] + self.head: List[nn.Module] = [] + self.last_block: nn.Moudle = None + self.isp_outs: List[nn.Module] = [] + self.isp_modules: List[nn.Module] = [] + self.index_to_isp_module: Dict[int, nn.Module] = {} + self.module_to_index: Dict[nn.Module, int] = {} + self.weight_global_handle: Dict[str, Any] = {} + 
self.weight_global_output: Dict[str, torch.Tensor] = {} + self.bias_global_handle: Dict[str, Any] = {} + self.bias_global_output: Dict[str, torch.Tensor] = {} + + class ISPCommunicator: """ ISP Communicator for managing the all-gather and reduce_scatter of Intern Sequence Parallel. @@ -160,72 +176,83 @@ def __init__( self.overlap = overlap self.enable_memory_pool = overlap and enable_memory_pool self.model_conf = model_conf + self.module_name = model_conf.modules.copy() self.is_forward = True + self.reduce_scatter_handlers = {} - self._isp_outs = [] - self._isp_modules = [] - self._module_name = model_conf.modules.copy() + # real overlap state for each chunk. + self._overlap_states: Dict[int, ISPOverlapState] = {} + # inner interface variables of overlap state. + self._num_blocks = None + self._head = None + self._embedding = None + self._last_block = None + self._isp_outs = None + self._isp_modules = None # key: isp module; value: module global all-gather op handle - self._weight_global_handle = {} + self._weight_global_handle = None # key: isp module; value: module bias global all-gather op handle - self._bias_global_handle = {} - self.reduce_scatter_handlers = {} + self._bias_global_handle = None # key: isp module; value: module global weight after all-gather op - self._weight_global_output = {} + self._weight_global_output = None # key: isp module; value: module bias global weight after all-gather op - self._bias_global_output = {} + self._bias_global_output = None # key: isp module; value: transformer block index - self._module_to_index = {} + self._module_to_index = None # key: transformer block index; value: isp modules - self._index_to_isp_module = {} - self._last_block = None - self._head = [] - self._embedding = [] - - # just want to share same for loop for ModuleList and Module - model = model if isinstance(model, nn.ModuleList) else [model] - for chunk in model: - if isinstance(chunk, NaiveAMPModel): - chunk = chunk.model - self._parse_model_structure(chunk) - - self.num_blocks = len(self._index_to_isp_module) + self._index_to_isp_module = None + # init memory pool if necessary. if self.enable_memory_pool: self.memory_pool = MemoryPool(model_conf) else: self.memory_pool = None + # init overlap states if necessary. if self.overlap: - self._register_sync_parameters_hook() + # just want to share same for loop for modulelist and module. + model = model if isinstance(model, nn.ModuleList) else [model] + # build overlap states for every chunk. + for chunk_id, chunk in enumerate(model): + if isinstance(chunk, NaiveAMPModel): + chunk = chunk.model + self._parse_model_structure(chunk_id, chunk) + # register overlap hooks for every chunk. + for chunk_id in range(len(model)): + self.switch_current_model_chunk(chunk_id) + self._register_sync_parameters_hook() + # switch to chunk 0 at first. 
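        # (each chunk gets an independent ISPOverlapState, so the all-gather and
        #  reduce-scatter bookkeeping of one chunk cannot leak into another when
        #  the interleaved scheduler alternates between chunks)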
+ self.switch_current_model_chunk(0) + + def _parse_model_structure(self, cid: int, model: nn.Module) -> None: + self._overlap_states[cid] = ISPOverlapState() - def _parse_model_structure(self, model: nn.Module) -> None: # Important: only works for llama-class models - for chunk_name, children in model.named_children(): + for _, children in model.named_children(): if isinstance(children, ScaleColumnParallelLinear): setattr(children, "isp_name", "head") - self._head.append(children) + self._overlap_states[cid].head.append(children) elif isinstance(children, Embedding1D): - self._embedding.append(children) + self._overlap_states[cid].embedding.append(children) elif isinstance(children, nn.ModuleList): - self._last_block = children[-1] + self._overlap_states[cid].last_block = children[-1] for idx, block in enumerate(children): - self._index_to_isp_module[idx] = [] + self._overlap_states[cid].index_to_isp_module[idx] = [] for sub_name, sub in block.named_children(): for name, child in sub.named_children(): if name == "out_proj": - self._isp_outs.append(child) - self._module_to_index[child] = idx + self._overlap_states[cid].isp_outs.append(child) + self._overlap_states[cid].module_to_index[child] = idx if isinstance(child, ISPLinear): - self._module_to_index[child] = idx - self._isp_modules.append(child) - self._index_to_isp_module[idx].append(child) + self._overlap_states[cid].module_to_index[child] = idx + self._overlap_states[cid].isp_modules.append(child) + self._overlap_states[cid].index_to_isp_module[idx].append(child) setattr(child, "isp_name", name) - full_name = f"{chunk_name}.{idx}.{sub_name}.{name}" + full_name = f"{cid}.{idx}.{sub_name}.{name}" setattr( child.weight, "isp_reduce_scatter_name", @@ -238,6 +265,8 @@ def _parse_model_structure(self, model: nn.Module) -> None: f"{full_name}.bias", ) + self._overlap_states[cid].num_blocks = len(self._overlap_states[cid].index_to_isp_module) + def _all_gather_module_weight(self, module): with_bias = module.bias is not None block_index = self._module_to_index[module] @@ -319,7 +348,7 @@ def _pre_forward_hook_for_out_proj(self, module: nn.Module, *args): # pylint: d self._all_gather_block_weight(block_index - 1) else: # start the all-gather for next block - if block_index + 1 < self.num_blocks: + if block_index + 1 < self._num_blocks: self._all_gather_block_weight(block_index + 1) def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 @@ -329,7 +358,7 @@ def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: dis self._wait_handle(module) def _pre_forward_hook_for_block(self, *args): # pylint: disable=W0613 - for module in self._index_to_isp_module[self.num_blocks - 1]: + for module in self._index_to_isp_module[self._num_blocks - 1]: self._all_gather_module_weight(module) self._wait_handle(module) @@ -343,7 +372,7 @@ def _post_backward_hook_for_head(self, *args): # pylint: disable=W0613 def _pre_backward_hook_for_head(self, *args): # pylint: disable=W0613 if self.is_forward is False: - self._all_gather_block_weight(self.num_blocks - 1) + self._all_gather_block_weight(self._num_blocks - 1) def _pre_backward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 # wait handle for current module @@ -413,6 +442,20 @@ def _get_constant_zero(self, size: tuple) -> torch.Tensor: device=self.model_conf.device, ).contiguous() + def switch_current_model_chunk(self, chunk_id: int) -> None: + self._isp_outs = self._overlap_states[chunk_id].isp_outs + self._isp_modules = 
self._overlap_states[chunk_id].isp_modules + self._weight_global_handle = self._overlap_states[chunk_id].weight_global_handle + self._bias_global_handle = self._overlap_states[chunk_id].bias_global_handle + self._weight_global_output = self._overlap_states[chunk_id].weight_global_output + self._bias_global_output = self._overlap_states[chunk_id].bias_global_output + self._module_to_index = self._overlap_states[chunk_id].module_to_index + self._index_to_isp_module = self._overlap_states[chunk_id].index_to_isp_module + self._last_block = self._overlap_states[chunk_id].last_block + self._head = self._overlap_states[chunk_id].head + self._embedding = self._overlap_states[chunk_id].embedding + self._num_blocks = self._overlap_states[chunk_id].num_blocks + # communication operation interfaces def all_gather(self, tensor: torch.Tensor, module: nn.Module, is_bias: bool = False): @@ -481,6 +524,9 @@ def __init__(self, overlap_handler: ISPCommunicator, zero_optim) -> None: def before_forward(self, scheduler, inputs) -> None: if self._isp_communicator.model_checkpoint: self._isp_communicator.is_forward = True + # switch model chunk before forward + chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank + self._isp_communicator.switch_current_model_chunk(chunk_id) def after_forward(self, scheduler, outputs) -> None: pass @@ -494,6 +540,9 @@ def after_criterion(self, scheduler, loss) -> None: def before_backward(self, scheduler, outputs, outputs_grad) -> None: if self._isp_communicator.model_checkpoint: self._isp_communicator.is_forward = False + # switch model chunk before forward + chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank + self._isp_communicator.switch_current_model_chunk(chunk_id) def after_backward(self, scheduler, inputs_grad) -> None: # accumulate left gradients in last bucket after backward. 
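In short, every scheduler hook in this patch re-points the communicator at the chunk about to run. A minimal self-contained sketch of that selection rule (select_chunk_id is a hypothetical helper; the argument stands in for gpc.virtual_pipeline_parallel_rank as used above):

    from typing import Optional

    def select_chunk_id(virtual_pp_rank: Optional[int]) -> int:
        # Non-interleaved schedulers never set a virtual pipeline rank,
        # which maps to the single model chunk 0.
        return 0 if virtual_pp_rank is None else virtual_pp_rank

    assert select_chunk_id(None) == 0
    assert select_chunk_id(1) == 1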
From 55ebba08bfc263ca4b19e0c32f5dd5ab9b45ab16 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Wed, 17 Jan 2024 17:08:00 +0800 Subject: [PATCH 102/153] add moe group --- internlm/core/context/__init__.py | 6 +- internlm/core/context/parallel_context.py | 2 +- .../core/context/process_group_initializer.py | 2 +- internlm/initialize/launch.py | 2 +- internlm/model/utils.py | 12 --- .../solver/optimizer/hybrid_zero_optim.py | 36 +++---- internlm/solver/optimizer/utils.py | 6 +- internlm/train/training_internlm.py | 63 +++++++----- internlm/train/utils.py | 95 +++---------------- internlm/utils/parallel.py | 9 ++ 10 files changed, 88 insertions(+), 145 deletions(-) diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index 13da8f58..a306ad70 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -1,7 +1,8 @@ from .parallel_context import ( - IS_TENSOR_ZERO_PARALLEL, - IS_TENSOR_DATA_PARALLEL, IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_EXPERT_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, IS_WEIGHT_ZERO_PARALLEL, Config, ParallelContext, @@ -34,6 +35,7 @@ "IS_TENSOR_DATA_PARALLEL", "IS_REPLICA_ZERO_PARALLEL", "IS_WEIGHT_ZERO_PARALLEL", + "IS_TENSOR_EXPERT_DATA_PARALLEL", "global_context", "ParallelContext", "ParallelMode", diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 826b51a1..62f1e42d 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -24,13 +24,13 @@ from .process_group_initializer import ParallelMode from .random import add_seed, get_seeds, set_mode - IS_REPLICA_ZERO_PARALLEL = "is_replica_zero_parallel" # for isp, with optimizer split in dp group IS_TENSOR_DATA_PARALLEL = "is_tensor_data_parallel" # for mtp/msp/fsp, with optimizer split in zero1 group IS_TENSOR_ZERO_PARALLEL = "is_tensor_zero_parallel" IS_WEIGHT_ZERO_PARALLEL = "is_weight_zero_parallel" +IS_TENSOR_EXPERT_DATA_PARALLEL = "is_tensor_expert_data_parallel" logger = get_logger(__file__) diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index dcf429d3..b6e72527 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -554,7 +554,7 @@ def __init__(self, *args, **kwargs): def _get_expert_parallel_ranks(self): """ Create expert and data parallel groups - Example: world_size = 8, model_parallel_size = 2, expert_parallel_size = 2 + Example: world_size = 8, tensor_parallel_size = 2, expert_parallel_size = 2 model_parallel_group = [0,1], [2,3], [4,5], [6,7] data_parallel_group = [0,2,4,6], [1,3,5,7] expert_parallel_group = [0,2], [4,6], [1,3], [5,7] diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index eedb0e65..ee9f4d4a 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -377,7 +377,7 @@ def args_sanity_check(): assert ( not optim_ckpt.overlap_sync_grad & optim_ckpt.overlap_sync_param ), "not support overlap and moe at the same time" - assert gpc.config.parallel.zero1.size == -1, "moe only support zero1, set zero1=dict(size=-1,...) 
can fix this" + assert gpc.config.parallel.zero1.size == gpc.get_world_size(ParallelMode.DATA), "moe only support zero1" def launch( diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 1e6d76b0..48eb4b78 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -724,18 +724,6 @@ def is_moe_param(param: torch.Tensor) -> bool: return False -def is_gate_param(param: torch.Tensor) -> bool: - if hasattr(param, "is_gate") and param.is_gate: - return True - return False - - -def is_norm_param(param: torch.Tensor) -> bool: - if hasattr(param, "is_norm") and param.is_norm: - return True - return False - - def Silu(w1_o, w2_o): return F.silu(w1_o) * w2_o diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index f7ce3bdc..9ef3aecf 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -13,8 +13,9 @@ from internlm.core.context import global_context as gpc from internlm.core.context.parallel_context import ( IS_TENSOR_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, + IS_TENSOR_EXPERT_DATA_PARALLEL, IS_TENSOR_ZERO_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, ) from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( @@ -167,6 +168,8 @@ def __init__( # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name self._broadcast_parallel_mode.append(zero_mode) + if self._is_moe_group(param_group): + grad_reduce_mode = ParallelMode.EXPERT_DATA if param_group["name"] != "embed_head" and self.use_isp: grad_reduce_mode = ParallelMode.WEIGHT_DATA else: @@ -288,12 +291,6 @@ def _partition_param_list(self, group_id, param_group): def _is_moe_group(self, param_group): return "moe" in param_group.keys() and param_group["moe"] - def _is_norm_group(self, param_group): - return "norm" in param_group.keys() and param_group["norm"] - - def _is_gate_group(self, param_group): - return "gate" in param_group.keys() and param_group["gate"] - # TODO check expert dp is correct when enable moe and overlap both def _attach_reduction_hook(self): # we iterate over the fp16 params @@ -619,17 +616,21 @@ def _compute_norm_with_stage( grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] - if group_id == 0: + if self.optim.param_groups[group_id]["name"] in ("default", "fp32"): for param in params: if self.use_isp: setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) else: setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif group_id == 1: + elif self.optim.param_groups[group_id]["name"] == "embed_head": + # should be isp mode for param in params: setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif self._is_moe_group(self.optim.param_groups[group_id]): + for param in params: + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) else: - raise NotImplementedError("group_id > 1 is not yet implemented.") + raise NotImplementedError("unrecognized parameter group.") norm = 0 if self._clip_grad_norm > 0: @@ -652,6 +653,8 @@ def _compute_norm_with_stage( delattr(param, IS_TENSOR_ZERO_PARALLEL) if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): delattr(param, IS_WEIGHT_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL): + delattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL) return norm @@ -830,19 +833,6 @@ def _step(self, closure=None, norms=None): param_shape == flat_fp32_avg_grads.shape ), f"fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}" - # Parameters shared within a TP group, such as 
norm and moe gate, have precision inconsistency in gradients. - # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors. - is_tp_sync_groups = ( - self._is_norm_group(self.optim.param_groups[group_id]), - self._is_gate_group(self.optim.param_groups[group_id]), - ) - if any(is_tp_sync_groups): - dist.all_reduce( - flat_fp32_avg_grads, - op=dist.ReduceOp.AVG, - group=gpc.get_group(ParallelMode.TENSOR), - ) - single_grad_partition_groups.append(flat_fp32_avg_grads) device = self._fp32_flat_param_groups_of_current_rank[group_id].device self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 42a9949f..184b715e 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -20,6 +20,7 @@ from internlm.utils.parallel import ( is_replica_zero_parallel_parameter, is_tensor_data_parallel_parameter, + is_tensor_expert_data_parallel_parameter, is_tensor_zero_parallel_parameter, is_weight_zero_parallel_parameter, ) @@ -255,6 +256,9 @@ def append_grad(g, p): elif is_weight_zero_parallel_parameter(p): # process all ranks for IS_WEIGHT_ZERO_PARALLEL parameter group append_grad(g, p) + elif is_tensor_expert_data_parallel_parameter(p): + # process all ranks for IS_TENSOR_EXPERT_DATA_PARALLEL parameter group + append_grad(g, p) elif gpc.get_local_rank(weight_parallel_mode) != 0: continue else: @@ -324,7 +328,7 @@ def compute_norm( """ Sum across all model-parallel GPUs. - 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and + 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and gradients along the pp+zero dimensions from all ranks should be aggregated. 2. For the IS_TENSOR_DATA_PARALLEL parameter group, gradients along the tp+pp+zero(dp) dimensions from all ranks should be aggregated. 3. For the IS_TENSOR_ZERO_PARALLEL parameter group, gradients along the tp+pp+zero dimensions from all ranks should be aggregated. 
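As a hedged, single-process illustration of rule 1 above (tp_grads and replica_sq_norm are illustrative stand-ins; the real code reduces partial norms over the pp and zero1 process groups instead of a plain sum):

    import torch

    tp_grads = [[torch.ones(4)], [torch.ones(4)]]  # identical replicated grad on 2 tp ranks

    def replica_sq_norm(tp_rank: int, grads) -> float:
        # Only tp rank 0 contributes replicated (e.g. norm-layer) gradients.
        if tp_rank != 0:
            return 0.0
        return sum(float(g.norm()) ** 2 for g in grads)

    # Partial squared norms are then summed across the remaining dimensions.
    total = sum(replica_sq_norm(r, g) for r, g in enumerate(tp_grads))
    assert total == 4.0  # counted once, not once per tp rank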
diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2822da5a..5d7b8926 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -20,6 +20,15 @@ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy from torch.utils.data import ConcatDataset, DataLoader +from internlm.core.communication.isp import ISPCommModelConfig, ISPCommunicator +from internlm.core.context import ( + IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_EXPERT_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) from internlm.core.context import global_context as gpc from internlm.core.context.random import set_mode from internlm.core.naive_amp import NaiveAMPModel @@ -36,17 +45,17 @@ from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - FeedForward, - RewardModelLinear, - ScaleColumnParallelLinear, BaseScaleColumnParallelLinear, ColumnParallelLinear, + FeedForward, + ISPLinear, + RewardModelLinear, RowParallelLinear, + ScaleColumnParallelLinear, ) +from internlm.model.moe import MoE from internlm.model.multi_head_attention import MHA -from internlm.model.linear import ISPLinear -from internlm.core.communication.isp import ISPCommunicator, ISPCommModelConfig -from internlm.model.utils import try_import_RMSNorm +from internlm.model.utils import is_moe_param, try_import_RMSNorm from internlm.monitor import send_heartbeat, set_env_var from internlm.monitor.monitor import monitor_manager as mm from internlm.solver.beta2_scheduler import Beta2Scheduler @@ -58,25 +67,17 @@ from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( + is_replica_zero_parallel_parameter, + is_tensor_data_parallel_parameter, + is_tensor_expert_data_parallel_parameter, + is_tensor_zero_parallel_parameter, + is_weight_zero_parallel_parameter, set_model_params_layer_name, sync_model_param, sync_model_replica_param_group, ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout -from internlm.core.context import ( - IS_TENSOR_ZERO_PARALLEL, - IS_REPLICA_ZERO_PARALLEL, - IS_TENSOR_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, - ParallelMode, -) -from internlm.utils.parallel import ( - is_replica_zero_parallel_parameter, - is_tensor_data_parallel_parameter, - is_tensor_zero_parallel_parameter, - is_weight_zero_parallel_parameter, -) RMSNorm = try_import_RMSNorm() logger = get_logger(__file__) @@ -89,10 +90,12 @@ def _check_module(module): for param in module.parameters(): setattr(param, IS_REPLICA_ZERO_PARALLEL, True) + if isinstance(module, MoE): + for param in module.moe_layer.gate.parameters(): + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) + # embedding and head - if isinstance(module, (Embedding1D, ParallelGPT2Embeddings)) or isinstance( - module, BaseScaleColumnParallelLinear - ): + if isinstance(module, (Embedding1D, ParallelGPT2Embeddings, BaseScaleColumnParallelLinear)): for param in module.parameters(): if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode == "isp": setattr(param, IS_TENSOR_DATA_PARALLEL, True) @@ -102,9 +105,20 @@ def _check_module(module): # for linear module if isinstance(module, (ColumnParallelLinear, RowParallelLinear)): for param in module.parameters(): - if gpc.is_initialized(ParallelMode.TENSOR) and 
gpc.config.parallel.tensor.mode != "isp": + if gpc.is_initialized(ParallelMode.EXPERT_DATA) and is_moe_param(param): + # module should be MoE experts's linear + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) + elif ( + not is_moe_param(param) + and gpc.is_initialized(ParallelMode.TENSOR) + and gpc.config.parallel.tensor.mode != "isp" + ): setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif gpc.is_initialized(ParallelMode.WEIGHT) and gpc.config.parallel.tensor.mode == "isp": + elif ( + not is_moe_param(param) + and gpc.is_initialized(ParallelMode.WEIGHT) + and gpc.config.parallel.tensor.mode == "isp" + ): setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) if not isinstance(model, nn.ModuleList): @@ -123,6 +137,7 @@ def _check_module(module): or is_tensor_data_parallel_parameter(param) or is_tensor_zero_parallel_parameter(param) or is_weight_zero_parallel_parameter(param) + or is_tensor_expert_data_parallel_parameter(param) ), f"parameter with name:{name} has no parallel attribution." diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 58880bb8..cd9ed0ac 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -4,84 +4,10 @@ from internlm.core.context.parallel_context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc -from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param +from internlm.model.utils import is_moe_param from internlm.utils.parallel import is_tensor_data_parallel_parameter -def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) -> Tuple[Dict]: - """Split parameters into different groups for optimizer - - Args: - param_groups (Tuple[Dict]): The list of parameter groups to split - Input Example: - >>> ( - >>> {'name': 'default', 'params': [tensor], 'weight_decay' :xxx}, - >>> ) - - Returns: - Tuple[Dict]: list of params groups for optimizer - Output Example: - >>> ( - >>> {'name': 'default','params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'fp32', 'params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'norm', 'norm': True, 'params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'gate', 'gate': True, 'params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'moe_ep_size_4', 'moe': True, 'params': [tensor],'weight_decay' :xxx}, - >>> ) - """ - - if isinstance(param_groups, tuple): - param_groups = list(param_groups) # Tuple cannot be modified - elif isinstance(param_groups, dict): - param_groups = [param_groups] - elif not isinstance(param_groups, list): - raise ValueError(f"Unknown param group type of {type(param_groups)}") - - # create new groups for fp32, norm, moe gate and moe expert - new_groups = {} - new_groups["fp32"] = {"name": "fp32", "params": [], "dp_mode": ParallelMode.DATA} - if gpc.config.model.get("num_experts", 0) > 1: - # norm and gate are special group to force sync (when enable MoE). - for key in ["gate", "norm"]: - new_groups[key] = {"name": key, key: True, "params": [], "dp_mode": ParallelMode.DATA} - for key in gpc.expert_parallel_group_names: - new_groups[key] = {"name": key, "moe": True, "params": [], "dp_mode": ParallelMode.EXPERT_DATA} - - for pgroup in param_groups: - # copy attribute from origin group, we assume the input param_groups only - # have one group, so the attribute will not be copyed multiple times. 
- for ori_key in pgroup.keys(): - if ori_key not in ("name", "params"): - for _, group in new_groups.items(): - group[ori_key] = pgroup[ori_key] - # assign param - origin_params = [] - # first split the norm and gate groups, which are special case to force sync (when enable MoE), - # then fp32 group and the moe group. - for param in pgroup["params"]: - if gpc.config.model.get("num_experts", 0) > 1 and is_norm_param(param): - new_groups["norm"]["params"].append(param) - # gate param means MoE is enabled - elif is_gate_param(param): - new_groups["gate"]["params"].append(param) - elif param.dtype == torch.float32: - new_groups["fp32"]["params"].append(param) - # moe param means MoE is enabled - elif is_moe_param(param): - new_groups[param.group_name]["params"].append(param) - else: - origin_params.append(param) - - # bf16 param group, which is the first group in the param groups - pgroup["params"] = origin_params - pgroup["dp_mode"] = ParallelMode.DATA - - # param groups may contain empty groups, such as fp32 - param_groups.extend(new_groups.values()) - - return tuple(param_groups) - - def split_params_into_different_groups_for_optimizer_with_new_partition_strategy( param_groups: Tuple[Dict], ) -> Tuple[Dict]: @@ -114,9 +40,15 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} - new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + if gpc.config.parallel.tensor.mode == "isp": + new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} + if gpc.config.model.get("num_experts", 0) > 1: + for key in gpc.expert_parallel_group_names: + new_groups[key] = {"name": key, "moe": True, "params": [], "optimizer_mode": ParallelMode.EXPERT_DATA} + for pgroup in param_groups: # copy attribute from origin group, we assume the input param_groups only # have one group, so the attribute will not be copyed multiple times. 
@@ -128,9 +60,15 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy origin_params = [] for param in pgroup["params"]: if is_tensor_data_parallel_parameter(param): + # should not be here if not isp mode new_groups["embed_head"]["params"].append(param) # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: # new_groups["layer_norm"]["params"].append(param) + elif param.dtype == torch.float32: + new_groups["fp32"]["params"].append(param) + # moe param means MoE is enabled + elif is_moe_param(param): + new_groups[param.group_name]["params"].append(param) else: origin_params.append(param) @@ -139,10 +77,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["optimizer_mode"] = ParallelMode.ZERO1 # param groups may contain empty groups, such as embed_head - if gpc.config.parallel.tensor.mode == "isp": - param_groups.extend(new_groups.values()) - else: - assert len(new_groups["embed_head"]["params"]) <= 0 + param_groups.extend(new_groups.values()) # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index e354f3b2..e66612f8 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -7,6 +7,7 @@ from internlm.core.context import ( IS_REPLICA_ZERO_PARALLEL, IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_EXPERT_DATA_PARALLEL, IS_TENSOR_ZERO_PARALLEL, IS_WEIGHT_ZERO_PARALLEL, ParallelMode, @@ -46,6 +47,14 @@ def is_weight_zero_parallel_parameter(p): ) +def is_tensor_expert_data_parallel_parameter(p): + return ( + gpc.is_initialized(ParallelMode.TENSOR) + and hasattr(p, IS_TENSOR_EXPERT_DATA_PARALLEL) + and getattr(p, IS_TENSOR_EXPERT_DATA_PARALLEL) + ) + + def sync_model_param(model): r"""Make sure data parameters are consistent during Data Parallel Mode. 
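The attribute machinery above follows one pattern throughout: each parameter is stamped at model-construction time with a string-named sentinel attribute (IS_REPLICA_ZERO_PARALLEL, IS_TENSOR_DATA_PARALLEL, the new IS_TENSOR_EXPERT_DATA_PARALLEL, ...), and later code dispatches on those flags when building optimizer parameter groups and choosing the gradient-reduction process group. A minimal standalone sketch of the idea (plain PyTorch with hypothetical simplified helpers, not InternLM code; the real is_tensor_expert_data_parallel_parameter() additionally guards on gpc.is_initialized(ParallelMode.TENSOR), which needs a live distributed context):

    import torch.nn as nn

    IS_TENSOR_EXPERT_DATA_PARALLEL = "is_tensor_expert_data_parallel"

    def tag_expert_params(module: nn.Module) -> None:
        # Mirrors what _check_module() does for an MoE expert's linear:
        # stamp every parameter with the sentinel attribute.
        for param in module.parameters():
            setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True)

    def is_expert_param(p: nn.Parameter) -> bool:
        # Same hasattr/getattr test as the new helper in internlm/utils/parallel.py,
        # minus the process-group initialization guard.
        return hasattr(p, IS_TENSOR_EXPERT_DATA_PARALLEL) and getattr(p, IS_TENSOR_EXPERT_DATA_PARALLEL)

    expert = nn.Linear(8, 8)
    tag_expert_params(expert)
    assert all(is_expert_param(p) for p in expert.parameters())

    # The optimizer-group split then routes tagged parameters into their own
    # group, analogous to new_groups[param.group_name] for MoE params above.
    groups = {"expert": [], "default": []}
    for p in expert.parameters():
        groups["expert" if is_expert_param(p) else "default"].append(p)

Because the flag lives on the parameter object itself, the final assertion in training_internlm.py can check, for every parameter by name, that it carries at least one parallel attribution, independent of which module produced it.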
From ab039d5b7bc0d5d4e8e0cbff97432fced146e2af Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Wed, 17 Jan 2024 17:22:59 +0800
Subject: [PATCH 103/153] fix(isp.py): fix comment

---
 internlm/core/communication/isp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py
index e048b623..ea628466 100644
--- a/internlm/core/communication/isp.py
+++ b/internlm/core/communication/isp.py
@@ -540,7 +540,7 @@ def after_criterion(self, scheduler, loss) -> None:
     def before_backward(self, scheduler, outputs, outputs_grad) -> None:
         if self._isp_communicator.model_checkpoint:
             self._isp_communicator.is_forward = False
-        # switch model chunk before forward
+        # switch model chunk before backward
         chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank
         self._isp_communicator.switch_current_model_chunk(chunk_id)

From 8347ab49e8fe665fef96a637a1af6b2810531d62 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Wed, 17 Jan 2024 17:37:15 +0800
Subject: [PATCH 104/153] feat(model): remove useless debug print

---
 internlm/model/embedding.py            |  4 ----
 internlm/model/modeling_internlm.py    | 12 ------------
 internlm/model/multi_head_attention.py |  8 --------
 internlm/train/utils.py                |  9 ---------
 sort_log.py                            | 17 -----------------
 5 files changed, 50 deletions(-)
 delete mode 100644 sort_log.py

diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py
index 11c71b2c..d1770538 100644
--- a/internlm/model/embedding.py
+++ b/internlm/model/embedding.py
@@ -59,10 +59,6 @@ def forward(self, input_: Tensor) -> Tensor:
         if gpc.config.parallel.sequence_parallel:
             output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)

-        # print(
-        #     f"ht debug embed: rank:{gpc.get_global_rank()} output.shape:{output.shape} output:{output}",
-        #     flush=True,
-        # )
         return output

diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index 032fef91..32c1c7b0 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -231,14 +231,7 @@ def _dropout_and_norm_ffn(_residual, _hidden_states):
         if self.residual_in_fp32:
             residual = residual.to(torch.float32)

-        # print(
-        #     f"ht debug mlp rank:{gpc.get_global_rank()} input.shape:{hidden_states.shape} input:{hidden_states}",
-        #     flush=True,
-        # )
         hidden_states = self.mlp(hidden_states)
-        # print(
-        #     f"ht debug mlp rank:{gpc.get_global_rank()} out.shape:{hidden_states.shape} out:{hidden_states}", flush=True
-        # )

         return hidden_states + residual

@@ -423,11 +416,6 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N
             else:
                 # Training
                 hidden_states = self.head(hidden_states, gather_dim=0)
-            # print(
-            #     f"ht debug head rank:{gpc.get_global_rank()} hidden_states.shape:{hidden_states.shape} hidden_states:{hidden_states}",
-            #     flush=True,
-            # )
-
         if not self.parallel_output:
             hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1)
         return hidden_states
diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py
index eba5a6f1..2010e2f5 100644
--- a/internlm/model/multi_head_attention.py
+++ b/internlm/model/multi_head_attention.py
@@ -433,11 +433,9 @@ def _packed_forward(self, x, inference_params=None, **kwargs):
         split x during sequence parallel, we split the batch * seqlen dimension
         (in case batch is small).
""" - # print(f"ht debug mha rank:{gpc.get_global_rank()} wqkv.shape:{self.Wqkv.weight.shape} wqkv:{self.Wqkv.weight}") qkv = self.Wqkv(x) # total x hsz' qkv = rearrange(qkv, "t (three h d) -> t three h d", three=3, d=self.head_dim) # total x 3 x n_head x d qkv = self.rotary_emb(qkv, **kwargs) - # print(f"ht debug mha rank:{gpc.get_global_rank()} qkv.shape:{qkv.shape} qkv:{qkv}", flush=True) kwargs.pop("indexes") if inference_params is None: if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn: @@ -452,12 +450,6 @@ def _packed_forward(self, x, inference_params=None, **kwargs): raise RuntimeError("Not support this right now") context = rearrange(context, "b h d -> b (h d)") # recover the shape - # print(f"ht debug mha rank:{gpc.get_global_rank()} context.shape:{context.shape} context:{context}") - # print( - # f"ht debug mha rank:{gpc.get_global_rank()} out_proj.shape:{self.out_proj.weight.shape} out_proj:{self.out_proj.weight}" - # ) out = self.out_proj(context) - # print(f"ht debug mha rank:{gpc.get_global_rank()} out.shape:{out.shape} out:{out}") - return out diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 58880bb8..5c78b5e0 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -110,12 +110,9 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy elif not isinstance(param_groups, list): raise ValueError(f"Unknown param group type of {type(param_groups)}") - # print(f"ht debug params_groups before split total len:{len(param_groups[0]['params'])}", flush=True) - # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} - # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} for pgroup in param_groups: # copy attribute from origin group, we assume the input param_groups only @@ -129,8 +126,6 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy for param in pgroup["params"]: if is_tensor_data_parallel_parameter(param): new_groups["embed_head"]["params"].append(param) - # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: - # new_groups["layer_norm"]["params"].append(param) else: origin_params.append(param) @@ -144,10 +139,6 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy else: assert len(new_groups["embed_head"]["params"]) <= 0 - # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) - # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) - # print(f"ht debug params_groups after split layer_norm len:{len(param_groups[2]['params'])}", flush=True) - return tuple(param_groups) diff --git a/sort_log.py b/sort_log.py deleted file mode 100644 index 786c2282..00000000 --- a/sort_log.py +++ /dev/null @@ -1,17 +0,0 @@ -import re - -# 读取日志信息 -with open("ht.log", "r") as file: - log_content = file.read() - -# 使用正则表达式提取以 "ht debug" 开头、以 "dtype=***" 结尾的日志信息块 -log_blocks = re.findall(r"ht debug.*?device=[^\n]*", log_content, re.DOTALL) - -# 将日志信息块按照 "rank:" 后的整数值进行正序排序 -sorted_log_blocks = sorted(log_blocks, key=lambda x: int(re.search(r"rank:(\d+)", x).group(1))) - -# 将排序后的日志信息块写入新的文件 -with open("sorted.log", "w") as file: - file.write("\n\n".join(sorted_log_blocks)) - -print("日志信息块已按照 rank: 后的整数值进行正序排序,并保存在 sorted_log_blocks.txt 
文件中。") From 7ed1109c7091f5b498db14c2d342197e837f739e Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 17 Jan 2024 17:51:19 +0800 Subject: [PATCH 105/153] feat(model): fix lint error --- internlm/core/communication/isp.py | 4 ++ internlm/core/context/__init__.py | 4 +- internlm/core/context/parallel_context.py | 1 - internlm/core/scheduler/pipeline_scheduler.py | 5 +-- internlm/model/linear.py | 6 ++- internlm/model/modeling_internlm.py | 1 - .../solver/optimizer/hybrid_zero_optim.py | 2 +- internlm/solver/optimizer/utils.py | 12 ++++-- internlm/train/training_internlm.py | 38 +++++++++---------- internlm/utils/model_checkpoint.py | 15 ++++---- 10 files changed, 46 insertions(+), 42 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index ea628466..53fd731e 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -142,6 +142,10 @@ def reset_lazy_pools(self) -> None: class ISPOverlapState: + """ + Overlap state for isp. + """ + def __init__(self) -> None: self.num_blocks: int = 0 self.embedding: List[nn.Module] = [] diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index 13da8f58..8ff56c31 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -1,7 +1,7 @@ from .parallel_context import ( - IS_TENSOR_ZERO_PARALLEL, - IS_TENSOR_DATA_PARALLEL, IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, IS_WEIGHT_ZERO_PARALLEL, Config, ParallelContext, diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 826b51a1..378faebd 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -24,7 +24,6 @@ from .process_group_initializer import ParallelMode from .random import add_seed, get_seeds, set_mode - IS_REPLICA_ZERO_PARALLEL = "is_replica_zero_parallel" # for isp, with optimizer split in dp group IS_TENSOR_DATA_PARALLEL = "is_tensor_data_parallel" diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py index 622c91f6..778331ee 100644 --- a/internlm/core/scheduler/pipeline_scheduler.py +++ b/internlm/core/scheduler/pipeline_scheduler.py @@ -133,10 +133,7 @@ def __init__( tensor_shape if tensor_shape is None or isinstance(tensor_shape, torch.Size) else torch.Size(tensor_shape) ) - self.scatter_gather_tensors = ( - scatter_gather_tensors - and gpc.is_using_parallel_mode(ParallelMode.TENSOR) - ) + self.scatter_gather_tensors = scatter_gather_tensors and gpc.is_using_parallel_mode(ParallelMode.TENSOR) if gpc.config.parallel.sequence_parallel: self.scatter_gather_tensors = False diff --git a/internlm/model/linear.py b/internlm/model/linear.py index ed21a21b..9506f608 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -12,8 +12,8 @@ from internlm.core.context import global_context as gpc from internlm.model.utils import ( Silu, - isp_fused_dense_func, fused_dense_func_torch, + isp_fused_dense_func, megatron_fused_dense_func_torch, ) @@ -351,6 +351,10 @@ def __init__( class ISPLinear(ColumnParallelLinear): + """ + Linear class for isp tensor parallel mode. + """ + # class level communicator variable. 
__communicator = None diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 32c1c7b0..7bb9ffa7 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -31,7 +31,6 @@ from internlm.utils.logger import get_logger from internlm.utils.registry import MODEL_INITIALIZER - MODEL_TYPE = "INTERNLM" logger = get_logger(__file__) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index f7ce3bdc..1445509d 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -13,8 +13,8 @@ from internlm.core.context import global_context as gpc from internlm.core.context.parallel_context import ( IS_TENSOR_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, IS_TENSOR_ZERO_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, ) from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 42a9949f..a9027705 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -324,11 +324,14 @@ def compute_norm( """ Sum across all model-parallel GPUs. - 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and + 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and gradients along the pp+zero dimensions from all ranks should be aggregated. - 2. For the IS_TENSOR_DATA_PARALLEL parameter group, gradients along the tp+pp+zero(dp) dimensions from all ranks should be aggregated. - 3. For the IS_TENSOR_ZERO_PARALLEL parameter group, gradients along the tp+pp+zero dimensions from all ranks should be aggregated. - 4. For the IS_WEIGHT_ZERO_PARALLEL parameter group, gradients along the wp+pp+zero dimensions from all ranks should be aggregated. + 2. For the IS_TENSOR_DATA_PARALLEL parameter group, gradients along the tp+pp+zero(dp) dimensions + from all ranks should be aggregated. + 3. For the IS_TENSOR_ZERO_PARALLEL parameter group, gradients along the tp+pp+zero dimensions + from all ranks should be aggregated. + 4. For the IS_WEIGHT_ZERO_PARALLEL parameter group, gradients along the wp+pp+zero dimensions + from all ranks should be aggregated. 
""" if is_tensor_data_parallel_parameter(parameters[0]): if gpc.is_using_parallel_mode(ParallelMode.TENSOR): @@ -368,6 +371,7 @@ def compute_norm( return total_norm +# ht mark: TODO def compute_param_norm( gradients, parameters, diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2822da5a..31b6238f 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -20,6 +20,14 @@ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy from torch.utils.data import ConcatDataset, DataLoader +from internlm.core.communication.isp import ISPCommModelConfig, ISPCommunicator +from internlm.core.context import ( + IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) from internlm.core.context import global_context as gpc from internlm.core.context.random import set_mode from internlm.core.naive_amp import NaiveAMPModel @@ -36,16 +44,15 @@ from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - FeedForward, - RewardModelLinear, - ScaleColumnParallelLinear, BaseScaleColumnParallelLinear, ColumnParallelLinear, + FeedForward, + ISPLinear, + RewardModelLinear, RowParallelLinear, + ScaleColumnParallelLinear, ) from internlm.model.multi_head_attention import MHA -from internlm.model.linear import ISPLinear -from internlm.core.communication.isp import ISPCommunicator, ISPCommModelConfig from internlm.model.utils import try_import_RMSNorm from internlm.monitor import send_heartbeat, set_env_var from internlm.monitor.monitor import monitor_manager as mm @@ -58,25 +65,16 @@ from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( + is_replica_zero_parallel_parameter, + is_tensor_data_parallel_parameter, + is_tensor_zero_parallel_parameter, + is_weight_zero_parallel_parameter, set_model_params_layer_name, sync_model_param, sync_model_replica_param_group, ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout -from internlm.core.context import ( - IS_TENSOR_ZERO_PARALLEL, - IS_REPLICA_ZERO_PARALLEL, - IS_TENSOR_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, - ParallelMode, -) -from internlm.utils.parallel import ( - is_replica_zero_parallel_parameter, - is_tensor_data_parallel_parameter, - is_tensor_zero_parallel_parameter, - is_weight_zero_parallel_parameter, -) RMSNorm = try_import_RMSNorm() logger = get_logger(__file__) @@ -90,9 +88,7 @@ def _check_module(module): setattr(param, IS_REPLICA_ZERO_PARALLEL, True) # embedding and head - if isinstance(module, (Embedding1D, ParallelGPT2Embeddings)) or isinstance( - module, BaseScaleColumnParallelLinear - ): + if isinstance(module, (Embedding1D, ParallelGPT2Embeddings, BaseScaleColumnParallelLinear)): for param in module.parameters(): if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode == "isp": setattr(param, IS_TENSOR_DATA_PARALLEL, True) diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 5c21af90..4fe45d5e 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -281,7 +281,6 @@ def save_model_checkpoint(folder, model): if folder is not None: dp_size = gpc.get_world_size(ParallelMode.DATA) tp_size = gpc.get_world_size(ParallelMode.TENSOR) - wp_size = 
gpc.get_world_size(ParallelMode.WEIGHT) dp_rank = gpc.get_local_rank(ParallelMode.DATA) tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) @@ -574,13 +573,15 @@ def load_optimizer_checkpoint(folder, optim): dp_size = gpc.get_world_size(ParallelMode.DATA) if gpc.config.parallel.tensor.mode == "isp": - assert ( - dp_size == max_dp + 1 - ), f"The optimizer states are save for {max_dp+1} data parallelism, while current has {dp_size} data parallelism" + assert dp_size == max_dp + 1, ( + f"The optimizer states are save for {max_dp+1} data parallelism, " + f"while current has {dp_size} data parallelism" + ) if gpc.config.parallel.tensor.mode != "isp": - assert ( - zero_size == max_zero + 1 - ), f"The optimizer states are save for {max_zero+1} zero parallel, while current has {zero_size} zero broadcast range." + assert zero_size == max_zero + 1, ( + f"The optimizer states are save for {max_zero+1} zero parallel, " + f"while current has {zero_size} zero broadcast range." + ) assert ( pp_size == max_pp + 1 ), f"The optimizer states are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" From ccc2108440530b46fe2be92f37ba6f27062646c2 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Thu, 18 Jan 2024 15:16:14 +0800 Subject: [PATCH 106/153] refactor code --- internlm/train/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 6b62dbc8..b4a98db9 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -41,7 +41,6 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy if gpc.config.parallel.tensor.mode == "isp": new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} - # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} if gpc.config.model.get("num_experts", 0) > 1: for key in gpc.expert_parallel_group_names: From fac2b200e3c98ad895e0048f47c68b71dc2de2db Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Thu, 18 Jan 2024 17:37:54 +0800 Subject: [PATCH 107/153] refactor code --- internlm/train/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/internlm/train/utils.py b/internlm/train/utils.py index b4a98db9..76c375de 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -59,8 +59,6 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy if is_tensor_data_parallel_parameter(param): # should not be here if not isp mode new_groups["embed_head"]["params"].append(param) - # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: - # new_groups["layer_norm"]["params"].append(param) elif param.dtype == torch.float32: new_groups["fp32"]["params"].append(param) # moe param means MoE is enabled From 05fa04a33cb331bd4245b4271eb6e111cbfd55b9 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 11:32:09 +0800 Subject: [PATCH 108/153] feat(multi_head_attention.py): set bias=True --- internlm/model/multi_head_attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 023b5478..87e2d42a 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -209,7 +209,7 @@ def __init__( embed_dim, 3 * embed_dim, 
process_group, - bias=False, + bias=True, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) # according to https://spaces.ac.cn/archives/9577 @@ -232,7 +232,7 @@ def __init__( embed_dim, embed_dim, process_group, - bias=False, + bias=True, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) From 91bd3f96608351dd99b7f1ac7f77eb05aa92fa78 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Fri, 19 Jan 2024 14:51:09 +0800 Subject: [PATCH 109/153] fix bugs --- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- internlm/train/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 13139938..978a7b4f 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -177,7 +177,7 @@ def __init__( if self._is_moe_group(param_group): grad_reduce_mode = ParallelMode.EXPERT_DATA - if param_group["name"] != "embed_head" and self.use_isp: + elif param_group["name"] != "embed_head" and self.use_isp: grad_reduce_mode = ParallelMode.WEIGHT_DATA else: grad_reduce_mode = ParallelMode.DATA diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 76c375de..4444b30d 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -42,7 +42,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} - if gpc.config.model.get("num_experts", 0) > 1: + if gpc.config.model.get("num_experts", 1) > 1: for key in gpc.expert_parallel_group_names: new_groups[key] = {"name": key, "moe": True, "params": [], "optimizer_mode": ParallelMode.EXPERT_DATA} From 20f6b36108aed9409fa68cc2fb7b7381d04d22e2 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Fri, 19 Jan 2024 15:01:11 +0800 Subject: [PATCH 110/153] support moe checkpoint --- internlm/utils/model_checkpoint.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 322ddf1e..25455231 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -354,14 +354,14 @@ def save_model_checkpoint(folder, model): llm_save(topo_fp, saved_obj=topo) # try to save expert parameter to separate files if model have moe layer - # expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA) - # expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA) - # should_save_rank_pair.clear() - # for i in range(tp_size): - # should_save_rank_pair.add((i, i % expert_dp_size)) - - # if (tp_rank, expert_dp_rank) in should_save_rank_pair: - # try_save_moe_checkpoint(folder, model, tp_rank, pp_rank) + expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA) + expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA) + should_save_rank_pair.clear() + for i in range(tp_size): + should_save_rank_pair.add((i, i % expert_dp_size)) + + if (tp_rank, expert_dp_rank) in should_save_rank_pair: + try_save_moe_checkpoint(folder, model, tp_rank, pp_rank) torch.distributed.barrier() From 7cdeea870de5bc556dcb8515aad6d9707a20daba Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 16:58:27 +0800 Subject: [PATCH 111/153] fix(tests): fix ci test error --- internlm/utils/parallel.py | 23 
-------------- tests/test_core/utils.py | 9 ++---- tests/test_training/test_loss.py | 3 +- .../test_swap_nb_loss_and_gradnorm.py | 31 +++++++++---------- tests/test_training/train_CI.py | 3 +- 5 files changed, 19 insertions(+), 50 deletions(-) diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 52ffc114..2614fe11 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -132,26 +132,3 @@ def set_model_params_layer_name(model): layer_param_name = f"{layer_name}-{param_name}" param.__setattr__("layer_name", layer_name) param.__setattr__("param_name", f"{layer_name}-{param_name}") - - -def check_sequence_parallel(model): - """ - check whether the norm module has IS_SEQUENCE_PARALLEL attribute. - when the sequence_parallel is True, the norm module should have the IS_SEQUENCE_PARALLEL attribute - to illustrate the norm should conduct the all-reduce for its grad. - """ - - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _, module in _chunk.named_modules(): - if isinstance(module, (RMSNorm, nn.LayerNorm)): - for param in module.parameters(): - assert hasattr(param, IS_SEQUENCE_PARALLEL), ( - "when the gpc.config.parallel.sequence parallel is True," - "the params of norm module should have IS_SEQUENCE_PARALLEL attribute" - ) diff --git a/tests/test_core/utils.py b/tests/test_core/utils.py index 6f66a152..3d25667f 100644 --- a/tests/test_core/utils.py +++ b/tests/test_core/utils.py @@ -10,12 +10,8 @@ from internlm.core.context import global_context as gpc from internlm.core.engine import Engine from internlm.core.gradient_handler import PipelineSharedModuleGradientHandler -from internlm.core.scheduler import ( - InterleavedPipelineScheduler, - NonPipelineScheduler, - PipelineScheduler, - SchedulerMetricHook, -) +from internlm.core.scheduler import InterleavedPipelineScheduler, NonPipelineScheduler, PipelineScheduler +from internlm.model.metrics import SchedulerMetricHook from internlm.solver.pipeline_utils import partition_uniform from internlm.train import initialize_optimizer @@ -156,7 +152,6 @@ def build_environment(rank, world_size, config): def loose_close(a, b, dtype: torch.dtype = torch.float32): - if dtype is torch.float32: rtol = 1.3e-6 atol = 1e-5 diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index 51f49836..a3b3b442 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -9,11 +9,10 @@ import internlm from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.scheduler import SchedulerMetricHook from internlm.core.trainer import TrainState from internlm.initialize import initialize_distributed_env from internlm.model.loss import FlashGPTLMLoss -from internlm.model.metrics import AccPerplex +from internlm.model.metrics import AccPerplex, SchedulerMetricHook from internlm.train import ( get_train_data_loader, initialize_model, diff --git a/tests/test_training/test_swap_nb_loss_and_gradnorm.py b/tests/test_training/test_swap_nb_loss_and_gradnorm.py index 64ed29dd..d9c6ac81 100644 --- a/tests/test_training/test_swap_nb_loss_and_gradnorm.py +++ b/tests/test_training/test_swap_nb_loss_and_gradnorm.py @@ -13,10 +13,9 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.context.parallel_context import Config -from internlm.core.scheduler import 
SchedulerMetricHook from internlm.initialize.launch import args_sanity_check from internlm.model.loss import FlashGPTLMLoss -from internlm.model.metrics import AccPerplex +from internlm.model.metrics import AccPerplex, SchedulerMetricHook from internlm.train import ( get_train_data_loader, get_validation_data_loader, @@ -226,10 +225,10 @@ def compute_trimmed_mean(value_list): def check_grad_norm(grad_norm_list): - standard_grad_norm_list = torch.load(os.path.join( - os.environ["share_path"], "quailty_assurance/small_300step_norm_grad/grad_norm_list.pt" - )) - + standard_grad_norm_list = torch.load( + os.path.join(os.environ["share_path"], "quailty_assurance/small_300step_norm_grad/grad_norm_list.pt") + ) + standard_grad_norm_list = standard_grad_norm_list[-100:] grad_norm_list = grad_norm_list[-100:] standard_grad_norm_list.sort() @@ -239,18 +238,18 @@ def check_grad_norm(grad_norm_list): trimmed_mean2 = compute_trimmed_mean(grad_norm_list) tensor_trimmed_mean1 = torch.tensor(trimmed_mean1) tensor_trimmed_mean2 = torch.tensor(trimmed_mean2) - + logger.info(f"norm_mean: {tensor_trimmed_mean1}, {tensor_trimmed_mean2}") assert torch.allclose(tensor_trimmed_mean1, tensor_trimmed_mean2, rtol=3e-1, atol=3e-1) logger.info(f"grad norm check passed") - + def check_meanLoss_val(all_loss, all_val): loss_values1 = all_loss[0][-100:] loss_values2 = all_loss[1][-100:] loss_values1.sort() loss_values2.sort() - + trimmed_mean1 = compute_trimmed_mean(loss_values1) trimmed_mean2 = compute_trimmed_mean(loss_values2) tensor_trimmed_mean1 = torch.tensor(trimmed_mean1) @@ -261,9 +260,9 @@ def check_meanLoss_val(all_loss, all_val): assert torch.allclose(tensor_trimmed_mean1, tensor_trimmed_mean2, rtol=3e-2, atol=3e-2) assert torch.allclose(torch.tensor(all_val[0]), torch.tensor(all_val[1]), rtol=3e-2, atol=3e-2) - + logger.info(f"loss check passed") - + def exam_loss(args): # init @@ -363,12 +362,12 @@ def exam_loss(args): # update parameters trainer_result = trainer.step() assert trainer_result is not None - + _, grad_norm_groups = trainer_result - + if gpc.is_rank_for_log(): logger.info(f"train_grad_norm_groups: {grad_norm_groups['0_default']}") - grad_norm_list.append(grad_norm_groups['0_default']) + grad_norm_list.append(grad_norm_groups["0_default"]) # evaluate on validation data loaders if valid_every > 0 and batch_count > 0 and (batch_count + 1) % valid_every == 0: @@ -381,10 +380,10 @@ def exam_loss(args): torch.cuda.empty_cache() dist.barrier() - + if gpc.is_rank_for_log(): check_grad_norm(grad_norm_list) - + return rank, loss_list, val_list diff --git a/tests/test_training/train_CI.py b/tests/test_training/train_CI.py index 98a69c9f..507cace1 100644 --- a/tests/test_training/train_CI.py +++ b/tests/test_training/train_CI.py @@ -19,11 +19,10 @@ import internlm # noqa: E402 from internlm.core.context import ParallelMode # noqa: E402 from internlm.core.context import global_context as gpc # noqa: E402 -from internlm.core.scheduler import SchedulerMetricHook # noqa: E402 from internlm.core.trainer import TrainState # noqa: E402 from internlm.initialize import initialize_distributed_env # noqa: E402 from internlm.model.loss import FlashGPTLMLoss # noqa: E402 -from internlm.model.metrics import AccPerplex # noqa: E402 +from internlm.model.metrics import AccPerplex, SchedulerMetricHook # noqa: E402 from internlm.monitor import ( # noqa: E402 initialize_monitor_manager, send_alert_message, From f959781b318e3b8a8829d90605b0f17922cbfbf7 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 
19 Jan 2024 17:53:09 +0800 Subject: [PATCH 112/153] fix(tests): fix ci test error --- .github/workflows/demo_in_readme.yaml | 1 + configs/13B_sft.py | 180 --------------------- configs/13B_template.py | 180 --------------------- configs/20B_sft.py | 180 --------------------- configs/30B_sft.py | 180 --------------------- configs/30B_template.py | 180 --------------------- configs/7B_sft.py | 10 +- configs/7B_template.py | 181 ---------------------- configs/generate.py | 53 ------- tests/test_core/test_pipeline.py | 6 +- tests/test_data/test_batch_sampler.py | 6 +- tests/test_model/test_model_internlm.py | 6 +- tests/test_training/7B_check_init.py | 2 +- tests/test_utils/common_fixture.py | 6 +- tests/test_utils/test_model_checkpoint.py | 3 - 15 files changed, 19 insertions(+), 1155 deletions(-) delete mode 100644 configs/13B_sft.py delete mode 100644 configs/13B_template.py delete mode 100644 configs/20B_sft.py delete mode 100644 configs/30B_sft.py delete mode 100644 configs/30B_template.py delete mode 100644 configs/7B_template.py delete mode 100644 configs/generate.py diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml index 8445ad09..a3840347 100644 --- a/.github/workflows/demo_in_readme.yaml +++ b/.github/workflows/demo_in_readme.yaml @@ -111,6 +111,7 @@ jobs: srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py cd .. rsync -av --remove-source-files $GITHUB_WORKSPACE/hf_ckpt ${{env.WORKSPACE_PREFIX}}/ci_clean_bak + load-chat-model-in-hf: if: ${{ !cancelled() }} needs: check-requirements diff --git a/configs/13B_sft.py b/configs/13B_sft.py deleted file mode 100644 index e3e17ae0..00000000 --- a/configs/13B_sft.py +++ /dev/null @@ -1,180 +0,0 @@ -JOB_NAME = "13b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. 
- # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. 
- For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="origin_tp"), - pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_template.py b/configs/13B_template.py deleted file mode 100644 index 849c5aa9..00000000 --- a/configs/13B_template.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = {seq_len} -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. 
(only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. 
mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/20B_sft.py b/configs/20B_sft.py deleted file mode 100644 index 13e68b22..00000000 --- a/configs/20B_sft.py +++ /dev/null @@ -1,180 +0,0 @@ -JOB_NAME = "20b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp", overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_sft.py b/configs/30B_sft.py deleted file mode 100644 index 8bde0571..00000000 --- a/configs/30B_sft.py +++ /dev/null @@ -1,180 +0,0 @@ -JOB_NAME = "30b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=4, fsdp=False), - tensor=dict(size=8, mode="fstp", overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_template.py b/configs/30B_template.py deleted file mode 100644 index d19ece6e..00000000 --- a/configs/30B_template.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = {seq_len} -JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
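# The {seq_len}/{sp}/{intern_overlap}/{checkpoint} placeholders in the
# template above are filled by plain string substitution, mirroring what
# configs/generate.py (deleted a few hunks below) does. Toy reproduction on a
# two-line stand-in template:
template = 'SEQ_LEN = {seq_len}\ntensor = dict(size=8, sp={sp}, intern_overlap={intern_overlap})'
rendered = (
    template.replace("{seq_len}", str(4096))
    .replace("{sp}", '"intern"')
    .replace("{intern_overlap}", str(True))
)
assert '"intern"' in rendered and "4096" in rendered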
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index e4028e80..66ffe0d0 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -56,8 +56,8 @@ valid_micro_num=4, # defaults to 0, means disable evaluate valid_every=50, - pack_sample_into_one=True, - total_steps=10, + pack_sample_into_one=False, + total_steps=50000, skip_batches="", # rampup_batch_size (str): A string with three space-separated integers representing the # starting batch size, the increment, and the number of steps between @@ -172,9 +172,9 @@ 3. memory_pool: bool, enable/disable memory pool, defaults to False. """ parallel = dict( - zero1=dict(size=2, fsdp=False), - tensor=dict(size=2, mode="mtp"), - pipeline=dict(size=2, interleaved_overlap=True), + zero1=dict(size=8, fsdp=False), + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), weight=dict(size=1, overlap=True, memory_pool=True), ) diff --git a/configs/7B_template.py b/configs/7B_template.py deleted file mode 100644 index d78fc884..00000000 --- a/configs/7B_template.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. 
- # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. 
-    * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-    * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', means the sequence parallel will be disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/generate.py b/configs/generate.py
deleted file mode 100644
index 5f044e72..00000000
--- a/configs/generate.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-import copy
-import subprocess
-
-name = "./configs/"
-root_names = ["7B_train_", "13B_train_", "30B_train_"]
-model_size = ["7B", "13B", "30B"]
-seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
-sp = ["none", "megatron", "flash-attn", "intern", "intern"]
-intern_overlap = [False, False, False, True, False]
-checkpoint = [False, True]
-
-for idx, root_name in enumerate(root_names):
-
-    # Path of the folder to create
-    folder_path = name + root_name[:-1]
-
-    # Create the folder with os.mkdir()
-    if not os.path.exists(folder_path):
-        os.mkdir(folder_path)
-
-    file_name = name + f"{model_size[idx]}_template.py"
-
-    with open(file_name, "r") as f:
-        lines = f.readlines()
-        origin_line = "".join(lines)
-        for seq in seq_length:
-            for i, sp_mode in enumerate(sp):
-                for ckpt in checkpoint:
-                    line = copy.copy(origin_line)
-                    line = line.replace("{seq_len}", str(seq))
-                    line = line.replace("{sp}", f"\"{sp_mode}\"")
-                    line = line.replace("{intern_overlap}", str(intern_overlap[i]))
-                    line = line.replace("{checkpoint}", str(ckpt))
-                    output_file_name = str(seq) + "_" + str(sp_mode) + "_overlap_" + str(intern_overlap[i]) + "_ckpt_" + str(ckpt) + ".py"
-                    write_file = folder_path + "/" + output_file_name
-                    with open(write_file, "w") as file:
-                        file.write(line)
-
-                    log_name = root_name + "_" + output_file_name[:-3]
-
-                    skip = True
-
-                    if sp_mode == "intern" and intern_overlap[i] is True:
-                        skip = False
-
-                    if skip:
-                        continue
-
-                    print(log_name)
-                    command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
-                    process = subprocess.Popen(command, shell=True, executable='/bin/bash')
-                    process.wait()
\ No newline at end of file
diff --git a/tests/test_core/test_pipeline.py b/tests/test_core/test_pipeline.py
index 4b37f61b..db7b3ddd 100644
--- a/tests/test_core/test_pipeline.py
+++ b/tests/test_core/test_pipeline.py
@@ -20,9 
+20,9 @@ gradient_handler=[dict(type="PipelineSharedModuleGradientHandler")], parallel=dict( zero1=dict(size=1, fsdp=False), - pipeline=dict(size=8, interleaved_overlap=False), - sequence_parallel=False, - tensor=1, + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=8, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ), model_type="INTERNLM", data=dict(seq_len=8, micro_num=16, micro_bsz=1, pack_sample_into_one=False, min_length=0, total_steps=9999), diff --git a/tests/test_data/test_batch_sampler.py b/tests/test_data/test_batch_sampler.py index eb835b2d..1faf4aee 100644 --- a/tests/test_data/test_batch_sampler.py +++ b/tests/test_data/test_batch_sampler.py @@ -164,9 +164,9 @@ def test_warmup(use_flash_atten_case, group_case, micro_bsz_case): dict( parallel=dict( zero1=dict(size=1, fsdp=False), - pipeline=dict(size=1, interleaved_overlap=False), - sequence_parallel=False, - tensor=1, + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ), data=dict( train_folder=None, diff --git a/tests/test_model/test_model_internlm.py b/tests/test_model/test_model_internlm.py index 9b6066ec..4c239c0f 100644 --- a/tests/test_model/test_model_internlm.py +++ b/tests/test_model/test_model_internlm.py @@ -18,9 +18,9 @@ dict( parallel=dict( zero1=dict(size=1, fsdp=False), - pipeline=dict(size=1, interleaved_overlap=False), - sequence_parallel=False, - tensor=1, + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ), model_type="INTERNLM", data=dict(seq_len=2048, micro_num=1, micro_bsz=1, pack_sample_into_one=False, min_length=0, total_steps=9999), diff --git a/tests/test_training/7B_check_init.py b/tests/test_training/7B_check_init.py index de6dcb2a..179892b6 100644 --- a/tests/test_training/7B_check_init.py +++ b/tests/test_training/7B_check_init.py @@ -157,7 +157,7 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node. 
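# The test configs above move from the legacy `tensor=1` form to
# `tensor=dict(size=..., mode="mtp")`. A standalone restatement of the
# normalization that args_sanity_check() in internlm/initialize/launch.py
# applies (visible as context in the launch.py hunk of a later patch); the
# helper name here is ours, not the repo's:
def normalize_tensor_cfg(parallel: dict) -> dict:
    if isinstance(parallel["tensor"], int):
        parallel["tensor"] = dict(size=parallel["tensor"], mode="mtp")
    if parallel["tensor"].get("mode") is None:
        parallel["tensor"]["mode"] = "mtp"
    return parallel

cfg = dict(zero1=dict(size=1, fsdp=False), tensor=4, pipeline=dict(size=2, interleaved_overlap=True))
assert normalize_tensor_cfg(cfg)["tensor"] == dict(size=4, mode="mtp")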
""" parallel = dict( - zero1=dict(size=-1, fsdp=False), + zero1=dict(size=1, fsdp=False), tensor=4, pipeline=dict(size=2, interleaved_overlap=True), sequence_parallel=False, diff --git a/tests/test_utils/common_fixture.py b/tests/test_utils/common_fixture.py index 60961565..96d3188c 100644 --- a/tests/test_utils/common_fixture.py +++ b/tests/test_utils/common_fixture.py @@ -44,9 +44,9 @@ dict( parallel=dict( zero1=dict(size=1, fsdp=False), - pipeline=dict(size=1, interleaved_overlap=False), - sequence_parallel=False, - tensor=1, + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ), model_type="INTERNLM", adam=dict( diff --git a/tests/test_utils/test_model_checkpoint.py b/tests/test_utils/test_model_checkpoint.py index 2dcabf4e..2063591c 100644 --- a/tests/test_utils/test_model_checkpoint.py +++ b/tests/test_utils/test_model_checkpoint.py @@ -16,8 +16,6 @@ LOCAL_SAVE_PATH, del_tmp_file, init_config, - init_dist_and_model, - reset_singletons, ) # (TOTAL_STEP, CKPT_EVERY, SNPASHOT_EVERY) @@ -164,7 +162,6 @@ def return_prefix_path(save_ckpt_folder): def return_latest_save_path(save_ckpt_folder, total_step, snapshot_freq, ckpt_freq): - snapshot_latest_step, normal_latest_step = 0, 0 snapshot_latest_count, normal_latest_count = 0, 0 From e873668a9d61de03f2d039ff599163f5767d5d70 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 18:59:14 +0800 Subject: [PATCH 113/153] fix(tests): fix ci test error --- internlm/core/context/parallel_context.py | 9 ++- .../core/context/process_group_initializer.py | 64 ------------------- internlm/utils/evaluation.py | 8 +-- 3 files changed, 10 insertions(+), 71 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 829b2d90..3a688f9b 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -148,6 +148,7 @@ def __init__(self): self.data_parallel_size = 1 self.pipeline_parallel_size = 1 self.tensor_parallel_size = 1 + self.weight_parallel_size = 1 self.zero1_parallel_size = -1 self.nettest_parallel_size = 1 self.expert_parallel_size = -1 @@ -483,11 +484,12 @@ def init_parallel_groups(self): # the user should not set the data parallel size manually # instead, it should be calculated based on other parallel config - assert self.zero1_parallel_size >= 1 self.sequence_parallel_size = self.tensor_parallel_size self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size if parallel_config["tensor"]["mode"] != "isp": + if self.zero1_parallel_size == -1: + self.zero1_parallel_size = self.data_parallel_size assert ( self.zero1_parallel_size <= self.data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" @@ -495,6 +497,8 @@ def init_parallel_groups(self): self.data_parallel_size % self.zero1_parallel_size == 0 ), f"data_parallel_size:{self.data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" else: + if self.zero1_parallel_size == -1: + self.zero1_parallel_size = self.weight_data_parallel_size assert ( self.zero1_parallel_size <= self.weight_data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than wdp_size:{self.weight_data_parallel_size}" @@ -502,6 +506,7 @@ def 
init_parallel_groups(self): f"weight_data_parallel_size:{self.weight_data_parallel_size} % " f"zero1_parallel_size: {self.zero1_parallel_size} != 0" ) + assert self.zero1_parallel_size >= 1 # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 @@ -538,8 +543,6 @@ def init_parallel_groups(self): if parallel_config["tensor"]["mode"] == "isp": initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) - # if self.weight_parallel_size <= 1: - # initializers.append(pgroup_initializer.Initializer_Model(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args)) if parallel_config["tensor"]["mode"] != "isp": initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index 5e59df22..76d42056 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -112,70 +112,6 @@ def init_dist_group(self, use_cpu: bool = False): pass -# class Initializer_Model(ProcessGroupInitializer): -# """A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel -# groups). - -# Args: -# rank (int): The rank of current process. -# world_size (int): Size of whole communication world. -# weight_parallel_size (int): Size of model weight parallel. -# weight_data_parallel_size (int): Size of data parallel for common weight. -# sequence_parallel_size (int): Size of data sequence parallel. -# data_parallel_size (int): Size of data parallel. -# pipeline_parallel_size (int): Size of pipeline parallel. -# tensor_parallel_size (int): Size of tensor parallel. -# zero1_parallel_size (int): Size of zero1 parallel. -# nettest_parallel_size (int): Size of net testing parallel. -# expert_parallel_size (int): Size of expert parallel. -# """ - -# def __init__(self, *args, **kwargs): -# super().__init__(*args, **kwargs) - -# # only for msp or fsp -# assert self.weight_parallel_size == 1 -# self.rank_num_per_group = self.tensor_parallel_size * self.pipeline_parallel_size -# self.num_group = self.world_size // self.rank_num_per_group - -# assert self.world_size % self.rank_num_per_group == 0 - -# def init_dist_group(self, use_cpu: bool = False): -# """Initialize model parallel groups, and assign local_ranks and groups to each gpu. - -# Returns: -# Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): -# A Model parallelism's information tuple. 
-# """ -# local_rank = None -# ranks_in_group = None -# process_group = None -# cpu_group = None -# group_world_size = None -# mode = ParallelMode.MODEL - -# for i in range(self.num_group): -# ranks = [i * self.rank_num_per_group + j for j in range(self.rank_num_per_group)] -# group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) -# if use_cpu: -# group_cpu = ( -# dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) -# if dist.get_backend() != "gloo" -# else group -# ) -# else: -# group_cpu = None - -# if self.rank in ranks: -# local_rank = ranks.index(self.rank) -# group_world_size = len(ranks) -# process_group = group -# cpu_group = group_cpu -# ranks_in_group = ranks - -# return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode - - class Initializer_Pipeline(ProcessGroupInitializer): """A ProcessGroupInitializer for pipeline parallelism. diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index e85773fd..e52586b8 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -10,7 +10,7 @@ @contextmanager -def switch_evaluation_no_pipeline_scheduler(trainer, grad_accum_size, grad_accum_batch_size, metric_hook_list): +def switch_evaluation_no_pipeline_scheduler(trainer, grad_accum_size, metric_hook_list): if not gpc.is_using_parallel_mode(ParallelMode.PIPELINE): prev_data_process_func = trainer.schedule.data_process_func prev_grad_accum_size = trainer.schedule._grad_accum_size @@ -50,10 +50,10 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - if gpc.config.parallel["tensor"]["sp"] == "intern": - gpc.config.parallel.sequence_parallel = True - else: + if gpc.config.parallel["tensor"]["mode"] == "mtp": gpc.config.parallel.sequence_parallel = False + else: + gpc.config.parallel.sequence_parallel = True yield finally: gpc.config.parallel.sequence_parallel = prev_mode From b99a6422f84473ba58db64dd88aa2e916e3998b0 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 19:19:22 +0800 Subject: [PATCH 114/153] fix(tests): fix ci test error --- internlm/core/context/parallel_context.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 3a688f9b..03fd6736 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -487,7 +487,9 @@ def init_parallel_groups(self): self.sequence_parallel_size = self.tensor_parallel_size self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size - if parallel_config["tensor"]["mode"] != "isp": + if isinstance(parallel_config["tensor"], int) or ( + isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] != "isp" + ): if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.data_parallel_size assert ( @@ -508,6 +510,14 @@ def init_parallel_groups(self): ) assert self.zero1_parallel_size >= 1 + # set sequence parallel value + if "sequence_parallel" not in parallel_config: + parallel_config._add_item("sequence_parallel", True) + if isinstance(parallel_config["tensor"], int) or ( + isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "mtp" + ): + 
parallel_config["sequence_parallel"] = False + # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 From 7ac53bf00f3bfb7845f5ab123991b84313879fd5 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 19:52:08 +0800 Subject: [PATCH 115/153] fix(tests): fix ci test error --- internlm/core/context/parallel_context.py | 35 +++++++++++------------ internlm/initialize/launch.py | 2 ++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 03fd6736..a7892434 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -487,18 +487,7 @@ def init_parallel_groups(self): self.sequence_parallel_size = self.tensor_parallel_size self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size - if isinstance(parallel_config["tensor"], int) or ( - isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] != "isp" - ): - if self.zero1_parallel_size == -1: - self.zero1_parallel_size = self.data_parallel_size - assert ( - self.zero1_parallel_size <= self.data_parallel_size - ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" - assert ( - self.data_parallel_size % self.zero1_parallel_size == 0 - ), f"data_parallel_size:{self.data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" - else: + if isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "isp": if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.weight_data_parallel_size assert ( @@ -508,6 +497,15 @@ def init_parallel_groups(self): f"weight_data_parallel_size:{self.weight_data_parallel_size} % " f"zero1_parallel_size: {self.zero1_parallel_size} != 0" ) + else: + if self.zero1_parallel_size == -1: + self.zero1_parallel_size = self.data_parallel_size + assert ( + self.zero1_parallel_size <= self.data_parallel_size + ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" + assert ( + self.data_parallel_size % self.zero1_parallel_size == 0 + ), f"data_parallel_size:{self.data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" assert self.zero1_parallel_size >= 1 # set sequence parallel value @@ -550,15 +548,14 @@ def init_parallel_groups(self): # run initialization of different process groups initializers = [] initializers.append(pgroup_initializer.Initializer_Weight(*initializer_args)) - if parallel_config["tensor"]["mode"] == "isp": - initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) + initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args)) - if parallel_config["tensor"]["mode"] != "isp": - initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) - else: + initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) + if isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "isp": initializers.append(pgroup_initializer.Initializer_Zero1_ISP(*initializer_args)) - if isinstance(self.config.parallel.zero1, dict) and 
self.config.parallel.zero1.get("fsdp", False): + else: + initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) + if isinstance(parallel_config["zero1"], dict) and parallel_config["zero1"].get("fsdp", False): initializers.append(pgroup_initializer.Initializer_Zero3_dp(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args)) if self.pipeline_parallel_size > 1: diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 47b3c11c..ed3dcad5 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -324,6 +324,8 @@ def args_sanity_check(): gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode="mtp") if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "mtp" + if gpc.config.parallel["tensor"]["mode"] == "isp": + assert not gpc.config.parallel.zero1.fsdp, "FSDP does not support isp" assert gpc.config.parallel["tensor"].get("mode", None) in [ "mtp", "msp", From bb5835e43a9b320c400b57a30f67fc3147086a01 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 20:05:14 +0800 Subject: [PATCH 116/153] fix(tests): fix ci test error --- internlm/core/context/parallel_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index a7892434..9cc6bcdd 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -636,7 +636,7 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): # during model construction), this is because the random state will be different in different tensor parallel # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform # additional random operations during the RowParallelLinear module building process. - # set_mode(ParallelMode.DUMMY) + set_mode(ParallelMode.DUMMY) if self.is_using_parallel_mode(ParallelMode.TENSOR): set_mode(ParallelMode.TENSOR) if self.is_using_parallel_mode(ParallelMode.WEIGHT): From d5872e712fe3be376ca4dc390109ab1f4fbefcc6 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 10:54:11 +0800 Subject: [PATCH 117/153] fix(tests): fix ci test error --- tests/test_data/test_batch_sampler.py | 2 +- tests/test_training/test_loss.py | 2 +- tests/test_training/test_swap_nb_loss_and_gradnorm.py | 4 ++-- tests/test_training/train_CI.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_data/test_batch_sampler.py b/tests/test_data/test_batch_sampler.py index 1faf4aee..e756d58a 100644 --- a/tests/test_data/test_batch_sampler.py +++ b/tests/test_data/test_batch_sampler.py @@ -123,7 +123,7 @@ def do_warmup(args): # test no-packed datasets. 
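# Standalone restatement of the bookkeeping the parallel_context.py hunks in
# the CI-fix patches above implement: zero1 == -1 resolves against the
# (weight-)data-parallel size depending on the tensor mode, and
# sequence_parallel defaults to True for every mode except "mtp". Toy
# function for illustration, not the gpc object itself.
def resolve(world_size, pp, tp, wp, mode, zero1=-1):
    dp = world_size // pp // tp           # sequence_parallel_size == tp
    wdp = world_size // pp // wp
    ref = wdp if mode == "isp" else dp
    zero1 = ref if zero1 == -1 else zero1
    assert 1 <= zero1 <= ref and ref % zero1 == 0
    return dict(dp=dp, wdp=wdp, zero1=zero1, sequence_parallel=(mode != "mtp"))

assert resolve(64, pp=1, tp=8, wp=1, mode="msp") == dict(dp=8, wdp=64, zero1=8, sequence_parallel=True)
assert resolve(64, pp=1, tp=1, wp=4, mode="isp") == dict(dp=64, wdp=16, zero1=16, sequence_parallel=True)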
     for _, val_dl in val_dls.items():
         for _, batch in enumerate(val_dl):
-            if gpc.is_using_pp():
+            if gpc.is_using_parallel_mode(ParallelMode.PIPELINE):
                 total_val_bsz = len(batch[1])
                 batch[0]["input_ids"] = batch[0]["input_ids"].to(torch.bfloat16)
                 assert total_val_bsz % micro_bsz == 0
diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py
index a3b3b442..7e694d57 100644
--- a/tests/test_training/test_loss.py
+++ b/tests/test_training/test_loss.py
@@ -93,7 +93,7 @@ def train(
     current_time = objs[0]
 
     # initialize model
-    model = initialize_model()
+    model, _ = initialize_model()
 
     # initialize loss function
     criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)
diff --git a/tests/test_training/test_swap_nb_loss_and_gradnorm.py b/tests/test_training/test_swap_nb_loss_and_gradnorm.py
index d9c6ac81..4d8afa28 100644
--- a/tests/test_training/test_swap_nb_loss_and_gradnorm.py
+++ b/tests/test_training/test_swap_nb_loss_and_gradnorm.py
@@ -278,7 +278,7 @@ def exam_loss(args):
     seed_all(1024)
 
     # initialize model
-    model = initialize_model()
+    model, _ = initialize_model()
 
     # initialize loss function
     criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing)
@@ -302,7 +302,7 @@ def exam_loss(args):
             SchedulerMetricHook(
                 metric=metric,
                 skip=(
-                    gpc.is_using_pp()
+                    gpc.is_using_parallel_mode(ParallelMode.PIPELINE)
                     and hasattr(gpc.config.model, "num_chunks")
                     and gpc.config.model.num_chunks > 1
                     and gpc.config.parallel["pipeline"].get("interleaved_overlap", False)
diff --git a/tests/test_training/train_CI.py b/tests/test_training/train_CI.py
index 507cace1..a985b985 100644
--- a/tests/test_training/train_CI.py
+++ b/tests/test_training/train_CI.py
@@ -124,7 +124,7 @@ def main(args):
     uniscale_logger = initialize_llm_logger(start_time=current_time)
 
     # initialize model
-    model = initialize_model()
+    model, _ = initialize_model()
 
     with open(args.config, "r") as f:
         config_lines = f.readlines()
@@ -181,7 +181,7 @@ def main(args):
             SchedulerMetricHook(
                 metric=metric,
                 skip=(
-                    gpc.is_using_pp()
+                    gpc.is_using_parallel_mode(ParallelMode.PIPELINE)
                    and hasattr(gpc.config.model, "num_chunks")
                     and gpc.config.model.num_chunks > 1
                     and gpc.config.parallel["pipeline"].get("interleaved_overlap", False)

From 0aebd2c1d95a863f32eaa2b3a21528d6620d9b20 Mon Sep 17 00:00:00 2001
From: Wenwen Qu
Date: Mon, 22 Jan 2024 12:03:29 +0800
Subject: [PATCH 118/153] update moe config file

---
 configs/7B_MoE4_sft.py        | 62 ++++++++++++++++++++++++-----------
 internlm/initialize/launch.py |  5 ++-
 2 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/configs/7B_MoE4_sft.py b/configs/7B_MoE4_sft.py
index 0672422f..f42bcaa1 100644
--- a/configs/7B_MoE4_sft.py
+++ b/configs/7B_MoE4_sft.py
@@ -28,7 +28,7 @@
     # 'load_ckpt_info' setting guide:
     # 1. the 'path' indicates the ckpt path,
    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, support: "internlm", "llama", "hf_llama".
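# A few concrete load_ckpt_info values matching the comment block above
# (the paths are placeholders, not real checkpoints):
load_model_only = dict(path="local:llm_ckpts/xxxx", content=("model",), ckpt_type="internlm")
resume_all_states = dict(path="local:llm_ckpts/49", content=("all",), ckpt_type="internlm")
init_from_hf_llama = dict(path="local:llama_ckpts/7B", content=("model",), ckpt_type="hf_llama")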
load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) @@ -44,8 +44,8 @@ oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. ) -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" +TRAIN_FOLDER = None # "/path/to/dataset" +VALID_FOLDER = None # "/path/to/dataset" data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update @@ -59,12 +59,17 @@ pack_sample_into_one=False, total_steps=50000, skip_batches="", + # rampup_batch_size (str): A string with three space-separated integers representing the + # starting batch size, the increment, and the number of steps between + # each increment. For example, "192 24 8" means that the batch size (micro_num) + # starts at 192 and increases by 24 every 8 steps. Defaults to None. + # (IMPORTANT): The interval step size is 'micro_bsz'. rampup_batch_size="", # Datasets with less than 50 rows will be discarded min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + train_folder=TRAIN_FOLDER, + valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=200, diag_outlier_ratio=1.1, ) @@ -145,23 +150,36 @@ moe_use_residual=False, moe_gate_k=2, ) - -# zero1 parallel: -# 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group, -# so parameters will be divided within the range of dp. -# 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters. -# 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size. -# For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. -# pipeline parallel (dict): -# 1. size: int, the size of pipeline parallel. -# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler. -# tensor parallel: tensor parallel size, usually the number of GPUs per node. - +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'], + defaults to 'mtp', means the pure megatron tensor parallel without sequence parallel. + msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size. + fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size. + isp: customed intern sequence parallel without tensor parallel, can be used with weight parallel. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +weight parallel (dict): + 1. size: int, the size of weight parallel. + 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. memory_pool: bool, enable/disable memory pool, defaults to False. +""" parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=1, + tensor=dict(size=1, mode="mtp"), pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=False, + weight=dict(size=1, overlap=True, memory_pool=True), ) cudnn_deterministic = False @@ -173,6 +191,10 @@ enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, ), ) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index ce43bde8..a6030e5d 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -401,7 +401,10 @@ def args_sanity_check(): assert ( not optim_ckpt.overlap_sync_grad & optim_ckpt.overlap_sync_param ), "not support overlap and moe at the same time" - assert gpc.config.parallel.zero1.size == gpc.get_world_size(ParallelMode.DATA), "moe only support zero1" + assert gpc.config.parallel.zero1.size in ( + -1, + gpc.get_world_size(ParallelMode.DATA), + ), "moe only support zero1, set zero1=dict(size=-1,...) can fix this" def launch( From 15610f6bda988c2a4a949dad9d578d558b895c93 Mon Sep 17 00:00:00 2001 From: JiaoPL Date: Mon, 22 Jan 2024 13:27:41 +0800 Subject: [PATCH 119/153] adapt grad profiling --- internlm/solver/optimizer/utils.py | 143 ++++++++++++++++++----------- 1 file changed, 91 insertions(+), 52 deletions(-) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 22839adb..3dd510ff 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -237,6 +237,13 @@ def reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=False parallel_grads = [] if fine_grained: parallel_grads = {} + + if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): + param_parallel_mode = ParallelMode.TENSOR + elif gpc.is_using_parallel_mode(weight_parallel_mode): + param_parallel_mode = weight_parallel_mode + else: + param_parallel_mode = ParallelMode.TENSOR def append_grad(g, p): if fine_grained: @@ -247,7 +254,7 @@ def append_grad(g, p): elif only_output: param_name = p.param_name if hasattr(p, "param_name") else "unknown-padding" if ( - gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(ParallelMode.TENSOR) + gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(param_parallel_mode) and gpc.config.model["hidden_size"] == g.shape[1] and "embedding" not in param_name.lower() ): @@ -325,12 +332,27 @@ def compute_norm( total_norm_cuda = max(total_norm_cuda, previous_norm) # Take max across all model-parallel GPUs. 
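# Single-process sketch of the reduction order the compute_norm hunk below
# switches to: instead of one all-reduce over ParallelMode.MODEL, the max is
# taken over the parameter's tensor/weight group and then over the pipeline
# group. torch.distributed calls are replaced here by plain max() so the
# example runs standalone.
def global_inf_norm(local_norms, tp_groups, pp_groups):
    norms = dict(local_norms)                  # rank -> local inf-norm
    for stage_groups in (tp_groups, pp_groups):
        for group in stage_groups:
            m = max(norms[r] for r in group)
            for r in group:
                norms[r] = m
    return norms

# 4 ranks: tp groups (0,1),(2,3); pp groups (0,2),(1,3) -> all ranks agree on 4.0
assert global_inf_norm({0: 1.0, 1: 4.0, 2: 2.0, 3: 3.0}, [(0, 1), (2, 3)], [(0, 2), (1, 3)]) == {0: 4.0, 1: 4.0, 2: 4.0, 3: 4.0}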
-    if gpc.get_world_size(ParallelMode.MODEL) > 1:
+    if is_tensor_data_parallel_parameter(parameters[0]):
+        if gpc.is_using_parallel_mode(ParallelMode.TENSOR):
+            dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.TENSOR))
+    elif is_tensor_zero_parallel_parameter(parameters[0]):
+        if gpc.is_using_parallel_mode(ParallelMode.TENSOR):
+            dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.TENSOR))
+    else:
+        if gpc.is_using_parallel_mode(weight_parallel_mode):
+            dist.all_reduce(
+                total_norm_cuda,
+                op=dist.ReduceOp.MAX,
+                group=gpc.get_group(weight_parallel_mode),
+            )
+
+    if gpc.is_using_parallel_mode(ParallelMode.PIPELINE):
         dist.all_reduce(
             total_norm_cuda,
             op=dist.ReduceOp.MAX,
-            group=gpc.get_group(ParallelMode.MODEL),
+            group=gpc.get_group(ParallelMode.PIPELINE),
         )
+
         total_norm = total_norm_cuda[0].item()
     else:
         tensor_parallel_grads = reduce_grads(gradients, parameters, weight_parallel_mode)
@@ -417,19 +439,28 @@ def compute_vocab_grad_norm(
     norm_type=2,
     zero_mode=ParallelMode.ZERO1,
 ):
+    weight_parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor.mode == "isp" else ParallelMode.TENSOR
     enable_cuda_kernels = gradients[0].device.type == "cuda"
     # Norm parameters.
     norm_type = float(norm_type)
     vocab_size = gpc.config.model["vocab_size"]
+
+    if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]):
+        param_parallel_mode = ParallelMode.TENSOR
+    elif gpc.is_using_parallel_mode(weight_parallel_mode):
+        param_parallel_mode = weight_parallel_mode
+    else:
+        param_parallel_mode = ParallelMode.TENSOR
 
-    param_grads = reduce_grads(gradients, parameters, only_output=True)
+    param_grads = reduce_grads(gradients, parameters, weight_parallel_mode, only_output=True)
 
     vocab_grad_norm = torch.zeros((vocab_size,), dtype=torch.float32).to(get_current_device())
     if param_grads:
         for grad in param_grads:
             # get grad norm of each vocab
             vocab_slice_size = grad.shape[0]
-            local_tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)
+            local_tp_rank = gpc.get_local_rank(param_parallel_mode)
             for i in range(vocab_slice_size):
                 cur_vocab_grad_norm = get_norm([grad[i, :]], norm_type, enable_cuda_kernels)[0]
                 vocab_grad_norm[i + vocab_slice_size * local_tp_rank] += get_tensor_norm(
@@ -442,14 +473,18 @@ def compute_vocab_grad_norm(
     if previous_vocab_grad_norm is not None:
         vocab_grad_norm = vocab_grad_norm + previous_vocab_grad_norm
 
-    if gpc.is_initialized(ParallelMode.MODEL):
-        dist.all_reduce(
-            vocab_grad_norm,
-            op=dist.ReduceOp.SUM,
-            group=gpc.get_group(ParallelMode.MODEL),
-        )
+    if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]):
+        if gpc.is_using_parallel_mode(ParallelMode.TENSOR):
+            dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR))
+    else:
+        if gpc.is_using_parallel_mode(weight_parallel_mode):
+            dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(weight_parallel_mode))
+
+    if gpc.is_using_parallel_mode(ParallelMode.PIPELINE):
+        dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE))
 
-    dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode))
+    if gpc.is_using_parallel_mode(zero_mode):
+        dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode))
 
     if zero_mode == ParallelMode.EXPERT_DATA:
         pg = gpc.get_group(ParallelMode.EXPERT)
@@ -479,11 +514,28 @@ def compute_param_metric(
 
     Arguments:
metric_type: (norm | zero_grad) """ + + def reduce_param_metric(input_param_metrics: Dict, parallel_mode): + output_param_metrics = {} + parallel_param_metrics = [None for _ in range(gpc.get_world_size(parallel_mode))] + dist.all_gather_object(parallel_param_metrics, input_param_metrics, group=gpc.get_group(parallel_mode)) + for local_param_metric in parallel_param_metrics: + for param_name, param_metric in local_param_metric.items(): + if param_name not in output_param_metrics: + output_param_metrics[param_name] = 0.0 + if metric_type == "norm" and norm_type == inf: + output_param_metrics[param_name] = max( + output_param_metrics[param_name], param_metric + ) + else: + output_param_metrics[param_name] += param_metric + return output_param_metrics + weight_parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor.mode == "isp" else ParallelMode.TENSOR enable_cuda_kernels = gradients[0].device.type == "cuda" - total_metrics = {} + param_metrics = {} - param_grads = reduce_grads(gradients, parameters, fine_grained=True) + param_grads = reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=True) if metric_type == "norm": # Norm parameters. @@ -510,65 +562,52 @@ def compute_param_metric( else: param_metrics[key] += value - # model parallel - model_parallel_param_metrics = {} - if gpc.is_initialized(ParallelMode.MODEL): - parallel_param_metrics = [None for _ in range(gpc.get_world_size(ParallelMode.MODEL))] - dist.all_gather_object(parallel_param_metrics, param_metrics, group=gpc.get_group(ParallelMode.MODEL)) - for local_param_metric in parallel_param_metrics: - for param_name, param_metric in local_param_metric.items(): - if param_name not in model_parallel_param_metrics: - model_parallel_param_metrics[param_name] = 0.0 - if metric_type == "norm" and norm_type == inf: - model_parallel_param_metrics[param_name] = max( - model_parallel_param_metrics[param_name], param_metric - ) - else: - model_parallel_param_metrics[param_name] += param_metric + # tensor parallel / weight parallel + if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): + if gpc.is_using_parallel_mode(ParallelMode.TENSOR): + param_metrics = reduce_param_metric(param_metrics, ParallelMode.TENSOR) + elif gpc.is_using_parallel_mode(weight_parallel_mode): + param_metrics = reduce_param_metric(param_metrics, weight_parallel_mode) + + # pipeline parallel + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): + param_metrics = reduce_param_metric(param_metrics, ParallelMode.PIPELINE) # zero parallel - zero_param_metrics = [None for _ in range(gpc.get_world_size(zero_mode))] - dist.all_gather_object(zero_param_metrics, model_parallel_param_metrics, group=gpc.get_group(zero_mode)) - for local_param_metric in zero_param_metrics: - for param_name, param_metric in local_param_metric.items(): - if param_name not in total_metrics: - total_metrics[param_name] = 0.0 - if metric_type == "norm" and norm_type == inf: - total_metrics[param_name] = max(total_metrics[param_name], param_metric) - else: - total_metrics[param_name] += param_metric + if gpc.is_using_parallel_mode(zero_mode): + param_metrics = reduce_param_metric(param_metrics, zero_mode) # moe if zero_mode == ParallelMode.EXPERT_DATA: pg = gpc.get_group(ParallelMode.EXPERT) - total_metric_values = list(total_metrics.values()) - if isinstance(total_metric_values[0], torch.Tensor): - scaled_param_metric = torch.stack(total_metric_values).to(device=get_current_device()) + param_metric_values = 
list(param_metrics.values()) + if isinstance(param_metric_values[0], torch.Tensor): + scaled_param_metric = torch.stack(param_metric_values).to(device=get_current_device()) else: - scaled_param_metric = torch.cuda.FloatTensor(total_metric_values, device=get_current_device()) + scaled_param_metric = torch.cuda.FloatTensor(param_metric_values, device=get_current_device()) scaled_param_metric = scaled_param_metric / float(gpc.get_world_size(ParallelMode.EXPERT)) dist.all_reduce(scaled_param_metric, group=pg) - for i, param_name in enumerate(total_metrics.keys()): - total_metrics[param_name] = scaled_param_metric[i] + for i, param_name in enumerate(param_metrics.keys()): + param_metrics[param_name] = scaled_param_metric[i] # calc zero grad percent if metric_type == "zero_grad": - for param_name, param_metric in total_metrics.items(): - total_metrics[param_name] = (param_metric[0] / param_metric[1]).item() + for param_name, param_metric in param_metrics.items(): + param_metrics[param_name] = (param_metric[0] / param_metric[1]).item() # scale norm if metric_type == "norm": - for param_name, param_metric in total_metrics.items(): + for param_name, param_metric in param_metrics.items(): if torch.is_tensor(param_metric): param_metric = param_metric.item() if param_metric in (inf, -inf): - total_metrics[param_name] = -1 + param_metrics[param_name] = -1 elif math.isnan(param_metric): - total_metrics[param_name] = -2 + param_metrics[param_name] = -2 else: - total_metrics[param_name] = param_metric + param_metrics[param_name] = param_metric - return total_metrics + return param_metrics def compute_param_norm( From c8b100ed6f6dba0c0dcb182e64b8dd9e21be670c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 14:09:24 +0800 Subject: [PATCH 120/153] fix(communication/isp.py): fix bias switch for mem pool --- internlm/core/communication/isp.py | 2 +- internlm/model/__init__.py | 3 ++- internlm/model/multi_head_attention.py | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index 0640f981..d0bbe2ed 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -209,7 +209,7 @@ def __init__( # init memory pool if necessary. 
if self.enable_memory_pool: - self.memory_pool = MemoryPool(model_conf) + self.memory_pool = MemoryPool(model_conf, with_bias=True) else: self.memory_pool = None diff --git a/internlm/model/__init__.py b/internlm/model/__init__.py index a4efc033..c10552c3 100644 --- a/internlm/model/__init__.py +++ b/internlm/model/__init__.py @@ -8,7 +8,7 @@ from .modeling_llama import build_model_with_cfg as build_model_with_llama_cfg from .modeling_moe import build_model_with_moe_cfg from .moe import MoE -from .multi_head_attention import MHA +from .multi_head_attention import MHA, DistributedAttention from .utils import gather_forward_split_backward __all__ = [ @@ -20,6 +20,7 @@ "ScaleColumnParallelLinear", "AccPerplex", "MHA", + "DistributedAttention", "gather_forward_split_backward", "build_model_with_cfg", "build_model_with_moe_cfg", diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 87e2d42a..825e3f21 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -187,6 +187,7 @@ def __init__( self.num_heads = num_heads assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads" self.head_dim = self.embed_dim // num_heads + self.tp_mode = tp_mode if self.rotary_emb_dim > 0: if self.use_dynamic_ntk_rope: @@ -204,7 +205,7 @@ def __init__( ) # notice here should change bias=True - Wqkv_cls = get_linear_cls(tp_mode, "column") + Wqkv_cls = get_linear_cls(self.tp_mode, "column") self.Wqkv = Wqkv_cls( embed_dim, 3 * embed_dim, @@ -220,14 +221,14 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) - if tp_mode == "isp": + if self.tp_mode == "isp": self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=sequence_process_group) self.inner_cross_attn = DistributedAttention( self.inner_cross_attn, sequence_process_group=sequence_process_group ) # output projection always have the bias (for now) - out_proj_cls = get_linear_cls(tp_mode, "row") + out_proj_cls = get_linear_cls(self.tp_mode, "row") self.out_proj = out_proj_cls( embed_dim, embed_dim, From c606bb57a86238187ff636957eab4f12a5f1477b Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 14:43:07 +0800 Subject: [PATCH 121/153] fix(model/utils.py): fix boolean value ambiguous error --- internlm/model/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 48eb4b78..c6ae7002 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -603,7 +603,7 @@ def backward(ctx, grad_output, *args): if ctx.needs_input_grad[1]: if grad_weight_sync: grad_weight_sync.wait() - if grad_bias and grad_bias_sync: + if grad_bias is not None and grad_bias_sync is not None: grad_bias_sync.wait() return grad_input, grad_weight, grad_bias, None, None, None, None, None, None From 70a17d62435c19a571df03316da73591630ad28c Mon Sep 17 00:00:00 2001 From: JiaoPL Date: Mon, 22 Jan 2024 17:39:33 +0800 Subject: [PATCH 122/153] test grad profiling with mtp,msp,fsp,isp --- .../solver/optimizer/hybrid_zero_optim.py | 79 ++++++++++++++++++- internlm/solver/optimizer/utils.py | 25 +++--- internlm/train/training_internlm.py | 2 +- 3 files changed, 91 insertions(+), 15 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index dac0453c..7cdbe3ff 100644 --- 
a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -661,13 +661,26 @@ def _compute_param_norm_stage( ): # compute norm for gradients that have been reduced params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket) - + params_is_padding = False total_param_norms = {} if len(params) == 0: + params_is_padding = True dtype = self.param_groups[group_id]["dtype"] grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] + if group_id == 0: + for param in params: + if self.use_isp: + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + else: + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + elif group_id == 1: + for param in params: + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + else: + raise NotImplementedError("group_id > 1 is not yet implemented.") + if self._clip_grad_norm > 0: total_param_norms = compute_param_norm( grads, @@ -676,17 +689,43 @@ def _compute_param_norm_stage( previous_param_norms=previous_param_norms, zero_mode=self._broadcast_parallel_mode[group_id], ) + + if params_is_padding: + for param in params: + if hasattr(param, IS_REPLICA_ZERO_PARALLEL): + delattr(param, IS_REPLICA_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_DATA_PARALLEL): + delattr(param, IS_TENSOR_DATA_PARALLEL) + if hasattr(param, IS_TENSOR_ZERO_PARALLEL): + delattr(param, IS_TENSOR_ZERO_PARALLEL) + if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): + delattr(param, IS_WEIGHT_ZERO_PARALLEL) + return total_param_norms def _compute_vocab_grad_norm_stage( self, group_id: int = 0, last_bucket: bool = False, last_stage: bool = False, previous_vocab_grad_norm=None ): params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket) + params_is_padding = False if len(params) == 0: + params_is_padding = True dtype = self.param_groups[group_id]["dtype"] grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] + if group_id == 0: + for param in params: + if self.use_isp: + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + else: + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + elif group_id == 1: + for param in params: + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + else: + raise NotImplementedError("group_id > 1 is not yet implemented.") + vocab_grad_norm = None if self._clip_grad_norm > 0: @@ -698,20 +737,44 @@ def _compute_vocab_grad_norm_stage( zero_mode=self._broadcast_parallel_mode[group_id], ) + if params_is_padding: + for param in params: + if hasattr(param, IS_REPLICA_ZERO_PARALLEL): + delattr(param, IS_REPLICA_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_DATA_PARALLEL): + delattr(param, IS_TENSOR_DATA_PARALLEL) + if hasattr(param, IS_TENSOR_ZERO_PARALLEL): + delattr(param, IS_TENSOR_ZERO_PARALLEL) + if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): + delattr(param, IS_WEIGHT_ZERO_PARALLEL) + return vocab_grad_norm def _count_zero_grads_stage( self, group_id: int = 0, last_bucket: bool = False, last_stage: bool = False, previous_zero_grad_count=None ): params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket) - + params_is_padding = False total_zero_grad_count = {} if len(params) == 0: + params_is_padding = True dtype = self.param_groups[group_id]["dtype"] grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] + if group_id == 0: + for param in params: + if self.use_isp: + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + else: + setattr(param, 
IS_TENSOR_ZERO_PARALLEL, True) + elif group_id == 1: + for param in params: + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + else: + raise NotImplementedError("group_id > 1 is not yet implemented.") + if self._clip_grad_norm > 0: total_zero_grad_count = compute_zero_grad_count( grads, @@ -720,6 +783,18 @@ def _count_zero_grads_stage( previous_zero_grad_count=previous_zero_grad_count, zero_mode=self._broadcast_parallel_mode[group_id], ) + + if params_is_padding: + for param in params: + if hasattr(param, IS_REPLICA_ZERO_PARALLEL): + delattr(param, IS_REPLICA_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_DATA_PARALLEL): + delattr(param, IS_TENSOR_DATA_PARALLEL) + if hasattr(param, IS_TENSOR_ZERO_PARALLEL): + delattr(param, IS_TENSOR_ZERO_PARALLEL) + if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): + delattr(param, IS_WEIGHT_ZERO_PARALLEL) + return total_zero_grad_count @llm_timeout(func_name="optim_step") diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 3dd510ff..a1964282 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -237,7 +237,7 @@ def reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=False parallel_grads = [] if fine_grained: parallel_grads = {} - + if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): param_parallel_mode = ParallelMode.TENSOR elif gpc.is_using_parallel_mode(weight_parallel_mode): @@ -444,8 +444,7 @@ def compute_vocab_grad_norm( # Norm parameters. norm_type = float(norm_type) vocab_size = gpc.config.model["vocab_size"] - - + if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): param_parallel_mode = ParallelMode.TENSOR elif gpc.is_using_parallel_mode(weight_parallel_mode): @@ -478,10 +477,10 @@ def compute_vocab_grad_norm( dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR)) else: if gpc.is_using_parallel_mode(weight_parallel_mode): - dist.all_reduce(vocab_grad_norm,op=dist.ReduceOp.SUM, group=gpc.get_group(weight_parallel_mode)) + dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(weight_parallel_mode)) if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): - dist.all_reduce(vocab_grad_norm,op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE)) + dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE)) if gpc.is_using_parallel_mode(zero_mode): dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) @@ -514,7 +513,7 @@ def compute_param_metric( Argumemts: metric_type: (norm | zero_grad) """ - + def reduce_param_metric(input_param_metrics: Dict, parallel_mode): output_param_metrics = {} parallel_param_metrics = [None for _ in range(gpc.get_world_size(parallel_mode))] @@ -524,9 +523,7 @@ def reduce_param_metric(input_param_metrics: Dict, parallel_mode): if param_name not in output_param_metrics: output_param_metrics[param_name] = 0.0 if metric_type == "norm" and norm_type == inf: - output_param_metrics[param_name] = max( - output_param_metrics[param_name], param_metric - ) + output_param_metrics[param_name] = max(output_param_metrics[param_name], param_metric) else: output_param_metrics[param_name] += param_metric return output_param_metrics @@ -563,11 +560,15 @@ def reduce_param_metric(input_param_metrics: Dict, parallel_mode): param_metrics[key] += value # tensor parallel / weight parallel - if 
is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): + if is_tensor_data_parallel_parameter(parameters[0]): if gpc.is_using_parallel_mode(ParallelMode.TENSOR): param_metrics = reduce_param_metric(param_metrics, ParallelMode.TENSOR) - elif gpc.is_using_parallel_mode(weight_parallel_mode): - param_metrics = reduce_param_metric(param_metrics, weight_parallel_mode) + elif is_tensor_zero_parallel_parameter(parameters[0]): + if gpc.is_using_parallel_mode(ParallelMode.TENSOR): + param_metrics = reduce_param_metric(param_metrics, ParallelMode.TENSOR) + else: + if gpc.is_using_parallel_mode(weight_parallel_mode): + param_metrics = reduce_param_metric(param_metrics, weight_parallel_mode) # pipeline parallel if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index d5f124f4..108a918b 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -62,7 +62,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile, get_current_device +from internlm.utils.common import DummyProfile, get_current_device, launch_time from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( From 4e9b27664fa740df38008eb5f1462129b1341a68 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 18:15:14 +0800 Subject: [PATCH 123/153] feat(training_internlm.py): update initialize_model func to adapt to private repo --- internlm/train/training_internlm.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index d5f124f4..b6db7b22 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -31,7 +31,7 @@ ) from internlm.core.context import global_context as gpc from internlm.core.context.random import set_mode -from internlm.core.naive_amp import NaiveAMPModel +from internlm.core.naive_amp import NaiveAMPModel, set_fp32_attr_to_module from internlm.core.trainer import TrainState from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn @@ -81,7 +81,17 @@ logger = get_logger(__file__) -def set_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): +def set_fp32_attr_for_model(model: Union[nn.Module, nn.ModuleList]): + if not isinstance(model, nn.ModuleList): + model = [model] + + for _chunk in model: + for _, module in _chunk.named_modules(): + if isinstance(module, (RMSNorm, nn.LayerNorm)) and gpc.config.model.get("use_fp32_norm", False): + set_fp32_attr_to_module(module) + + +def set_parallel_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): def _check_module(module): # layer_norm if isinstance(module, (RMSNorm, nn.LayerNorm)): @@ -111,6 +121,7 @@ def _check_module(module): if isinstance(_chunk, NaiveAMPModel): _chunk = _chunk.model + # set param parallel attribute for name, module in _chunk.named_modules(): _check_module(module) @@ -124,7 +135,7 @@ def _check_module(module): @llm_timeout(func_name="initialize_model") -def initialize_model(): +def initialize_model(pre_process_func: 
Optional[Callable] = None, post_process_func: Optional[Callable] = None): """ Initialize model with Automatic Mixed Precision. @@ -132,8 +143,15 @@ def initialize_model(): torch.nn.Module: The neural network model to be trained or evaluated. """ - + if pre_process_func: + pre_process_output = pre_process_func() model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model)) + if post_process_func: + post_process_func(pre_process_output) + + # should be set before NaiveAMPModel + set_fp32_attr_for_model(model) + if isinstance(model, nn.ModuleList): model = nn.ModuleList( [ @@ -154,7 +172,7 @@ def initialize_model(): sync_buffer=False, ) - set_attr_for_param_groups(model) + set_parallel_attr_for_param_groups(model) # This sync is very important, cause the model weights kept in optimizer are copied # from the origin parameters in the memory, so we should make sure the dp sync From 32df5ad2cdb2e30a33fc6b10b4e57f9ed8f775af Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 19:42:19 +0800 Subject: [PATCH 124/153] feat(training_internlm.py): move get_scheduler_hooks from train.py to training_internlm.py --- internlm/model/modeling_internlm.py | 1 + internlm/train/__init__.py | 2 ++ internlm/train/training_internlm.py | 28 ++++++++++++++++++++++++++-- train.py | 23 +---------------------- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 01f647e1..6a237641 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -507,6 +507,7 @@ def build_model_with_cfg( dropout_selective_checkpoint=True, use_scaled_init: bool = True, use_swiglu: bool = True, + use_fp32_norm: bool = True, use_flash_attn: bool = True, rope_base: int = 10000, ): diff --git a/internlm/train/__init__.py b/internlm/train/__init__.py index 1fd08028..90bb5d86 100644 --- a/internlm/train/__init__.py +++ b/internlm/train/__init__.py @@ -7,6 +7,7 @@ load_new_batch, record_current_batch_training_metrics, wrap_FSDP_model, + get_scheduler_hooks, ) __all__ = [ @@ -18,4 +19,5 @@ "load_new_batch", "record_current_batch_training_metrics", "wrap_FSDP_model", + "get_scheduler_hooks", ] diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index b6db7b22..88d404a8 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -6,7 +6,7 @@ import pickle import time from functools import partial -from typing import Callable, Iterable, Optional, Union +from typing import Callable, Iterable, Optional, Union, List import torch import torch.distributed as dist @@ -62,7 +62,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile, get_current_device +from internlm.utils.common import DummyProfile, get_current_device, SchedulerHook from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( @@ -76,6 +76,8 @@ ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout +from internlm.model.metrics import SchedulerMetricHook +from internlm.core.communication.isp import ISPCommunicatorSchedulerHook RMSNorm = try_import_RMSNorm() logger = get_logger(__file__) @@ -303,6 +305,28 @@ def 
initialize_optimizer(model: Union[nn.Module, nn.ModuleList], isp_communicato return optimizer, beta2_scheduler, lr_scheduler +def get_scheduler_hooks(metric, zero_optim, isp_communicator) -> List[SchedulerHook]: + scheduler_hooks: List[SchedulerHook] = [] + + if metric is not None: + scheduler_hooks.append( + SchedulerMetricHook( + metric=metric, + skip=( + gpc.is_using_parallel_mode(ParallelMode.PIPELINE) + and hasattr(gpc.config.model, "num_chunks") + and gpc.config.model.num_chunks > 1 + and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) + ), + ), + ) + + if isp_communicator is not None: + scheduler_hooks.append(ISPCommunicatorSchedulerHook(isp_communicator, zero_optim)) + + return scheduler_hooks + + @llm_timeout(func_name="get_train_data_loader") def get_train_data_loader(num_worker: int = 0, dataset_generate_func: Optional[Callable] = None): """ diff --git a/train.py b/train.py index 46775ac9..bd931890 100644 --- a/train.py +++ b/train.py @@ -29,6 +29,7 @@ initialize_optimizer, load_new_batch, record_current_batch_training_metrics, + get_scheduler_hooks, ) from internlm.utils.common import ( BatchSkipper, @@ -71,28 +72,6 @@ def initialize_llm_logger(start_time: str): return uniscale_logger -def get_scheduler_hooks(metric, zero_optim, isp_communicator) -> List[SchedulerHook]: - scheduler_hooks: List[SchedulerHook] = [] - - if metric is not None: - scheduler_hooks.append( - SchedulerMetricHook( - metric=metric, - skip=( - gpc.is_using_parallel_mode(ParallelMode.PIPELINE) - and hasattr(gpc.config.model, "num_chunks") - and gpc.config.model.num_chunks > 1 - and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) - ), - ), - ) - - if isp_communicator is not None: - scheduler_hooks.append(ISPCommunicatorSchedulerHook(isp_communicator, zero_optim)) - - return scheduler_hooks - - def main(args): # init setting skip_batches = gpc.config.data.skip_batches From d388ddc2ecf5ad633d73fe6f58536ccf2bf1d96f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 23 Jan 2024 11:05:06 +0800 Subject: [PATCH 125/153] feat(model): fix dict has no attri mode error --- internlm/model/modeling_internlm.py | 4 +++- internlm/solver/optimizer/hybrid_zero_optim.py | 5 ++++- internlm/solver/optimizer/utils.py | 4 +++- internlm/train/training_internlm.py | 16 ++++++++++------ internlm/train/utils.py | 2 +- internlm/utils/model_checkpoint.py | 16 ++++++++-------- internlm/utils/parallel.py | 14 +++++++++----- 7 files changed, 38 insertions(+), 23 deletions(-) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 6a237641..934981f2 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -307,7 +307,9 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) - self.tp_mode = gpc.config.parallel.tensor.mode + self.tp_mode = "mtp" + if isinstance(gpc.config.parallel.tensor, dict): + self.tp_mode = gpc.config.parallel.tensor.get("mode", "mtp") if is_reward: head_cls = RewardModelLinear diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index dac0453c..056bb6de 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -84,7 +84,10 @@ def __init__( clip_grad_norm = zero_cfg.clip_grad_norm self._overlap_sync_grad = zero_cfg.overlap_sync_grad self._overlap_sync_param = zero_cfg.overlap_sync_param - self.use_isp = gpc.config.parallel.tensor.mode == "isp" 
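        # (editorial sketch of the failure this hunk guards against: user configs may
        # set `tensor` to a plain int, e.g. tensor=2 as in configs/7B_sft.py, or to a
        # raw dict without a "mode" key, so attribute access like
        # gpc.config.parallel.tensor.mode raises AttributeError; the dict-style lookup
        # with an "mtp" default, as in the replacement lines below, is the safe form:)
        #     tensor_cfg = gpc.config.parallel["tensor"]
        #     tp_mode = tensor_cfg.get("mode", "mtp") if isinstance(tensor_cfg, dict) else "mtp"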
+ self.use_isp = ( + isinstance(gpc.config.parallel["tensor"], dict) + and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" + ) super().__init__(optim=optimizer) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 22839adb..31073336 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -308,7 +308,9 @@ def compute_norm( Total norm of the parameters, need total_norm**(1/norm) before using. """ - weight_parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor.mode == "isp" else ParallelMode.TENSOR + weight_parallel_mode = ( + ParallelMode.WEIGHT if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.TENSOR + ) enable_cuda_kernels = gradients[0].device.type == "cuda" # Norm parameters. norm_type = float(norm_type) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 88d404a8..17e42418 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -94,6 +94,8 @@ def set_fp32_attr_for_model(model: Union[nn.Module, nn.ModuleList]): def set_parallel_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): + tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") + def _check_module(module): # layer_norm if isinstance(module, (RMSNorm, nn.LayerNorm)): @@ -103,17 +105,17 @@ def _check_module(module): # embedding and head if isinstance(module, (Embedding1D, ParallelGPT2Embeddings, BaseScaleColumnParallelLinear)): for param in module.parameters(): - if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode == "isp": + if gpc.is_initialized(ParallelMode.TENSOR) and tp_mode == "isp": setattr(param, IS_TENSOR_DATA_PARALLEL, True) - elif gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode != "isp": + elif gpc.is_initialized(ParallelMode.TENSOR) and tp_mode != "isp": setattr(param, IS_TENSOR_ZERO_PARALLEL, True) # for linear module if isinstance(module, (ColumnParallelLinear, RowParallelLinear)): for param in module.parameters(): - if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode != "isp": + if gpc.is_initialized(ParallelMode.TENSOR) and tp_mode != "isp": setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif gpc.is_initialized(ParallelMode.WEIGHT) and gpc.config.parallel.tensor.mode == "isp": + elif gpc.is_initialized(ParallelMode.WEIGHT) and tp_mode == "isp": setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) if not isinstance(model, nn.ModuleList): @@ -187,13 +189,15 @@ def initialize_model(pre_process_func: Optional[Callable] = None, post_process_f # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. 
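    # (editorial note, mirroring the replacement lines below: under isp the weights
    # are sharded over ParallelMode.WEIGHT, so ranks holding the same weight shard
    # form the WEIGHT_DATA group and must share RNG state, while the other
    # tensor-parallel modes keep the plain DATA group:)
    #     mode = gpc.config.parallel["tensor"].get("mode", "mtp")
    #     set_mode(ParallelMode.WEIGHT_DATA if mode == "isp" else ParallelMode.DATA)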
- random_mode = ParallelMode.WEIGHT_DATA if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.DATA + random_mode = ( + ParallelMode.WEIGHT_DATA if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.DATA + ) set_mode(random_mode) # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - if gpc.config.parallel.tensor.mode != "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": isp_communicator = None else: isp_communicator = ISPCommunicator( diff --git a/internlm/train/utils.py b/internlm/train/utils.py index ed4b7415..97f49f74 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -126,7 +126,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["optimizer_mode"] = ParallelMode.ZERO1 # param groups may contain empty groups, such as embed_head - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": param_groups.extend(new_groups.values()) else: assert len(new_groups["embed_head"]["params"]) <= 0 diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 322ddf1e..62ee66aa 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -325,7 +325,7 @@ def save_model_checkpoint(folder, model): # even if pp is not considered, it will definitely not be written on the same machine. # for tensor parallel mode with isp - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": if wdp_rank == 0 or dp_rank == 0: fn = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" fp = os.path.join(folder, fn) @@ -564,7 +564,7 @@ def load_model_checkpoint(folder, model): for fn in fns: if fn.startswith("model_t") and not fn.endswith(".md5"): segements = os.path.splitext(fn)[0].split("_") - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": max_pp = max(max_pp, int(segements[-1][2:])) max_wp = max(max_wp, int(segements[-2][2:])) max_tp = max(max_tp, int(segements[-3][2:])) @@ -590,7 +590,7 @@ def load_model_checkpoint(folder, model): dp_size == max_zo + 1 ), f"The weights are save for {max_zo+1} FSDP shards , while current has {dp_size} FSDP shards" - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": should_load_name = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" elif gpc.config.parallel.zero1.fsdp: should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_dp{dp_rank}.pt" @@ -702,7 +702,7 @@ def save_optimizer_checkpoint(optim, state_path): states = optim.state_dict() if isinstance(optim, HybridZeroOptimizer): - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" llm_save(os.path.join(state_path, fp), states) else: @@ -752,7 +752,7 @@ def load_optimizer_checkpoint(folder, optim): max_tp, max_wp, max_pp, max_zero, max_dp = 0, 0, 0, 0, 0 for fn in fns: if fn.startswith("optimizer_") and not fn.endswith(".md5"): - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": _, tp, wp, pp, dp = os.path.splitext(fn)[0].split("_") max_dp = max(max_dp, int(dp[2:])) max_tp = max(max_tp, int(tp[2:])) @@ -770,12 +770,12 @@ def load_optimizer_checkpoint(folder, optim): pp_size = gpc.get_world_size(ParallelMode.PIPELINE) dp_size = gpc.get_world_size(ParallelMode.DATA) 
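    # (editorial sketch of the on-disk naming this loader parses, taken from
    # save_optimizer_checkpoint earlier in this diff:
    #     isp mode:   optimizer_tp{tp}_wp{wp}_pp{pp}_dp{dp}.pt
    #     otherwise:  optimizer_tp{tp}_pp{pp}_zo{zero}.pt
    # so the max_dp / max_zero values recovered from the filenames must match the
    # current data-parallel / zero world sizes asserted below.)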
- if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": assert dp_size == max_dp + 1, ( f"The optimizer states are save for {max_dp+1} data parallelism, " f"while current has {dp_size} data parallelism" ) - if gpc.config.parallel.tensor.mode != "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": assert zero_size == max_zero + 1, ( f"The optimizer states are save for {max_zero+1} zero parallel, " f"while current has {zero_size} zero broadcast range." @@ -795,7 +795,7 @@ def load_optimizer_checkpoint(folder, optim): wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) dp_rank = gpc.get_local_rank(ParallelMode.DATA) - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" else: fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 2614fe11..703c5dd7 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -26,7 +26,7 @@ def is_replica_zero_parallel_parameter(p): def is_tensor_data_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.TENSOR) - and gpc.config.parallel.tensor.mode == "isp" + and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" and hasattr(p, IS_TENSOR_DATA_PARALLEL) and getattr(p, IS_TENSOR_DATA_PARALLEL) ) @@ -35,7 +35,7 @@ def is_tensor_data_parallel_parameter(p): def is_tensor_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.TENSOR) - and gpc.config.parallel.tensor.mode != "isp" + and gpc.config.parallel["tensor"].get("mode", "mtp") != "isp" and hasattr(p, IS_TENSOR_ZERO_PARALLEL) and getattr(p, IS_TENSOR_ZERO_PARALLEL) ) @@ -44,7 +44,7 @@ def is_tensor_zero_parallel_parameter(p): def is_weight_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.WEIGHT) - and gpc.config.parallel.tensor.mode == "isp" + and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" and hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) ) @@ -58,7 +58,9 @@ def sync_model_param(model): """ sync_moe_param = gpc.is_using_parallel_mode(ParallelMode.EXPERT_DATA) - sync_parallel_mode = ParallelMode.WEIGHT_DATA if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.DATA + sync_parallel_mode = ( + ParallelMode.WEIGHT_DATA if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.DATA + ) for param in model.parameters(): if sync_moe_param and getattr(param, "is_expert", False): ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) @@ -79,7 +81,9 @@ def sync_model_replica_param_group(model): model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. 
""" - parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.TENSOR + parallel_mode = ( + ParallelMode.WEIGHT if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.TENSOR + ) if gpc.is_using_parallel_mode(parallel_mode): for param in model.parameters(): if is_replica_zero_parallel_parameter(param): From 8e1b6199386b7da97714e1e1633a282cad4b2fa7 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 23 Jan 2024 15:28:41 +0800 Subject: [PATCH 126/153] feat(training_internlm.py): move use_fp32_norm config to gpc.config --- configs/7B_sft.py | 1 + internlm/model/modeling_internlm.py | 1 - internlm/train/training_internlm.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 66ffe0d0..615cd6c3 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -129,6 +129,7 @@ cur_iter=-1, ) +use_fp32_norm = False model = dict( checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] num_attention_heads=NUM_ATTENTION_HEAD, diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 934981f2..8ba10d0e 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -509,7 +509,6 @@ def build_model_with_cfg( dropout_selective_checkpoint=True, use_scaled_init: bool = True, use_swiglu: bool = True, - use_fp32_norm: bool = True, use_flash_attn: bool = True, rope_base: int = 10000, ): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 17e42418..f5e2712b 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -89,7 +89,7 @@ def set_fp32_attr_for_model(model: Union[nn.Module, nn.ModuleList]): for _chunk in model: for _, module in _chunk.named_modules(): - if isinstance(module, (RMSNorm, nn.LayerNorm)) and gpc.config.model.get("use_fp32_norm", False): + if isinstance(module, (RMSNorm, nn.LayerNorm)) and gpc.config.get("use_fp32_norm", False): set_fp32_attr_to_module(module) From 978cea8b6dacb0db2cdc8f6c5bb4aa1f8c827e4c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 12:38:46 +0800 Subject: [PATCH 127/153] feat(version): update internevo version and torch verion --- requirements/runtime.txt | 2 +- requirements/torch.txt | 8 ++++---- version.txt | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 814f69bb..d5922304 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -14,4 +14,4 @@ botocore torch-scatter pyecharts py-libnuma --f https://data.pyg.org/whl/torch-1.13.1+cu117.html +-f https://data.pyg.org/whl/torch-2.1.0+cu118.html diff --git a/requirements/torch.txt b/requirements/torch.txt index 4b1efcb7..c9a04b3d 100644 --- a/requirements/torch.txt +++ b/requirements/torch.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/cu117 -torch==1.13.1+cu117 -torchvision==0.14.1+cu117 -torchaudio==0.13.1 +--extra-index-url https://download.pytorch.org/whl/cu118 +torch==2.1.0+cu118 +torchvision==0.16.0+cu118 +torchaudio==2.1.0+cu118 diff --git a/version.txt b/version.txt index 0ea3a944..0d91a54c 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.2.0 +0.3.0 From d5fe8fe6873e5515722e574d791f0ecc369b2181 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 13:09:21 +0800 Subject: [PATCH 
128/153] feat(context/parallel_context.py): set default parallel size in parallel context to fix e2e tests --- internlm/core/context/parallel_context.py | 11 +++++++++++ internlm/initialize/launch.py | 10 +--------- internlm/model/modeling_internlm.py | 4 ++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 9cc6bcdd..faa49dff 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -477,6 +477,17 @@ def init_parallel_groups(self): # set parallel size as attributes for global context parallel_config = self.config.get("parallel", None) if parallel_config is not None: + # set default value for parallel size + if "zero1" not in parallel_config: + parallel_config._add_item("zero1", dict(size=-1, fsdp=False)) + if "pipeline" not in parallel_config: + parallel_config._add_item("pipeline", dict(size=1, interleaved_overlap=False)) + if "tensor" not in parallel_config: + parallel_config._add_item("tensor", dict(size=1, mode="mtp")) + if "weight" not in parallel_config: + parallel_config._add_item("weight", dict(size=1, overlap=False, memory_pool=False)) + + # get value from config self._set_parallel_size_from_config(parallel_config, "weight", "weight_parallel_size") self._set_parallel_size_from_config(parallel_config, "tensor", "tensor_parallel_size") self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size") diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index ed3dcad5..041445ca 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -466,7 +466,7 @@ def launch( logger.info( f"Distributed environment is initialized, " f"data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, " - f"tensor parallel size: {gpc.tensor_parallel_size}", + f"tensor parallel size: {gpc.tensor_parallel_size}, weight parallel size: {gpc.weight_parallel_size}", ) if gpc.config.model.get("num_experts", 1) > 1: logger.info( @@ -475,14 +475,6 @@ def launch( f"number of local experts: {gpc.config.model.num_experts//gpc.expert_parallel_size}" ) - print( - f"global_rank:{gpc.get_global_rank()} wp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT)} " - f"sp_rank:{gpc.get_local_rank(ParallelMode.SEQUENCE)} pp_rank:{gpc.get_local_rank(ParallelMode.PIPELINE)} " - f"zo1_rank:{gpc.get_local_rank(ParallelMode.ZERO1)} dp_rank:{gpc.get_local_rank(ParallelMode.DATA)} " - f"wdp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}", - flush=True, - ) - def launch_from_slurm( config: Union[str, Path, Config, Dict], diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 8ba10d0e..f06d6532 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -308,8 +308,8 @@ def __init__( checkpoint_layer_num = int(num_layers * checkpoint) self.tp_mode = "mtp" - if isinstance(gpc.config.parallel.tensor, dict): - self.tp_mode = gpc.config.parallel.tensor.get("mode", "mtp") + if isinstance(gpc.config.parallel["tensor"], dict): + self.tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") if is_reward: head_cls = RewardModelLinear From 1d64a22b52fe21435dab780365f1a71b6d068716 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 14:49:58 +0800 Subject: [PATCH 129/153] feat(format): fix ci lint check error --- internlm/initialize/initialize_trainer.py | 2 +- 
internlm/model/metrics.py | 2 +- internlm/model/modeling_llama.py | 23 +--------------- internlm/train/__init__.py | 2 +- internlm/train/training_internlm.py | 33 +++++------------------ internlm/utils/common.py | 3 ++- internlm/utils/gputest.py | 9 +++---- train.py | 6 +---- 8 files changed, 18 insertions(+), 62 deletions(-) diff --git a/internlm/initialize/initialize_trainer.py b/internlm/initialize/initialize_trainer.py index 4827fbcf..91fddebe 100644 --- a/internlm/initialize/initialize_trainer.py +++ b/internlm/initialize/initialize_trainer.py @@ -25,7 +25,7 @@ from internlm.data.utils import unpack_data from internlm.solver.beta2_scheduler import Beta2Scheduler from internlm.solver.optimizer.hybrid_zero_optim import BaseOptimizer -from internlm.utils.common import get_current_device, SchedulerHook +from internlm.utils.common import SchedulerHook, get_current_device def initialize_trainer( diff --git a/internlm/model/metrics.py b/internlm/model/metrics.py index a19bbee0..aebdc13d 100644 --- a/internlm/model/metrics.py +++ b/internlm/model/metrics.py @@ -6,8 +6,8 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.common import SchedulerHook +from internlm.utils.megatron_timers import megatron_timer as timer class AccPerplex: diff --git a/internlm/model/modeling_llama.py b/internlm/model/modeling_llama.py index 2638ce27..5f999c8f 100644 --- a/internlm/model/modeling_llama.py +++ b/internlm/model/modeling_llama.py @@ -168,11 +168,6 @@ def __init__( sequence_parallel=sequence_parallel, **factory_kwargs, ) - # need to assign tp attribute so that internlm know it is tensor parallel module - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - for name in ["wo", "wq", "wk", "wv"]: - for param in getattr(self, name).parameters(): - setattr(param, IS_TENSOR_PARALLEL, True) def forward(self, x, seqlen=None, inference_params=None, **kwargs): if kwargs.get("indexes", None) is not None: @@ -594,16 +589,6 @@ def __init__( dtype=dtype, ) - for _, param in self.feed_forward.named_parameters(): - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) - for param in self.attention_norm.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - for param in self.ffn_norm.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - self.dropout2 = nn.Dropout(drop_rate) self.use_swiglu = use_swiglu self.use_scaled_init = use_scaled_init @@ -857,9 +842,8 @@ def __init__( normal_(std=embedding_init_std)(param) else: uniform_(std=embedding_init_std)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) self.embed_grad_scale = embed_grad_scale + self.layers = nn.ModuleList( [ PackedFlashLlamaLayer1D( @@ -901,9 +885,6 @@ def __init__( self.norm = RMSNorm(hidden_size, eps=layer_norm_epsilon) else: self.norm = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) - for param in self.norm.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) self.output = head_cls( in_features=hidden_size, @@ -920,8 +901,6 @@ def __init__( normal_(std=out_head_init_std)(param) else: uniform_(std=out_head_init_std)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) self.parallel_output = parallel_output diff 
--git a/internlm/train/__init__.py b/internlm/train/__init__.py index 90bb5d86..e4f049d7 100644 --- a/internlm/train/__init__.py +++ b/internlm/train/__init__.py @@ -1,4 +1,5 @@ from .training_internlm import ( + get_scheduler_hooks, get_train_data_loader, get_validation_data_loader, initialize_llm_profile, @@ -7,7 +8,6 @@ load_new_batch, record_current_batch_training_metrics, wrap_FSDP_model, - get_scheduler_hooks, ) __all__ = [ diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 9c0474e8..6120974c 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -69,7 +69,12 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile, SchedulerHook, get_current_device +from internlm.utils.common import ( + DummyProfile, + SchedulerHook, + get_current_device, + launch_time, +) from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( @@ -509,30 +514,6 @@ def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: Trai return batch, train_iter -# def initialize_llm_profile(profiling: bool = False, start_time: str = None): -# """Initialize and return the profiler context manager instance.""" - -# if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0: -# llm_profile = torch.profiler.profile -# logger.info(f"Do profiling in rank {gpc.get_global_rank()}!") -# else: -# llm_profile = DummyProfile - -# return llm_profile( -# activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], -# schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1), -# on_trace_ready=torch.profiler.tensorboard_trace_handler( -# f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" -# + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" -# + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_" -# + f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}", -# ), -# with_stack=True, -# with_modules=True, -# profile_memory=True, -# ) - - def initialize_llm_profile(profiling: bool = False, start_time: str = None): """Initialize and return the profiler context manager instance.""" @@ -549,7 +530,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): f"RUN/{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" + f"wp{gpc.get_local_rank(ParallelMode.WEIGHT)}_" - + f"sp{gpc.get_local_rank(ParallelMode.SEQUENCE)}", + + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}", ), with_stack=True, with_modules=True, diff --git a/internlm/utils/common.py b/internlm/utils/common.py index 9dc4efea..7ef57278 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -1,14 +1,15 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from abc import ABC, abstractmethod import bisect import inspect import os import random +from abc import ABC, abstractmethod from contextlib import contextmanager from datetime import datetime from typing import Union + import numpy as np import torch diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index becc9d85..39c7341e 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -273,11 +273,6 @@ def 
bench_gpu(use_flash_attn=True): ) -""" -Useful utility functions migrated from deepseped. -""" - - def warmup_process_group(): # Prevent OOM from nccl communication. if dist.is_initialized(): @@ -305,6 +300,10 @@ def warmup_process_group(): def cuda_memory_analyze(step=0, print_mm_suage=False): + """ + Useful utility functions migrated from deepseped. + """ + global n_caching_allocator_flushes torch.cuda.synchronize() diff --git a/train.py b/train.py index bd931890..e0e99aff 100644 --- a/train.py +++ b/train.py @@ -1,12 +1,10 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from pickle import FALSE import socket import time import traceback from functools import partial -from typing import List import torch import torch.distributed as dist @@ -17,8 +15,7 @@ from internlm.core.trainer import TrainState from internlm.initialize import initialize_distributed_env from internlm.model.loss import FlashGPTLMLoss -from internlm.model.metrics import AccPerplex, SchedulerMetricHook -from internlm.core.communication.isp import ISPCommunicatorSchedulerHook +from internlm.model.metrics import AccPerplex from internlm.monitor import initialize_monitor_manager, send_alert_message from internlm.monitor.monitor import monitor_manager as mm from internlm.train import ( @@ -37,7 +34,6 @@ get_megatron_flops_2, launch_time, parse_args, - SchedulerHook, ) from internlm.utils.evaluation import evaluate_on_val_dls from internlm.utils.gputest import empty_cache_and_diag From b0c6a20101908379baa7b3e10d5d188a908dc5ea Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 14:53:45 +0800 Subject: [PATCH 130/153] feat(format): fix ci lint check error --- internlm/core/scheduler/no_pipeline_scheduler.py | 2 +- internlm/core/scheduler/pipeline_scheduler.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index cb8ff780..0cd8c103 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -11,7 +11,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.engine import Engine -from internlm.utils.common import conditional_context, SchedulerHook +from internlm.utils.common import SchedulerHook, conditional_context from internlm.utils.logger import get_logger from internlm.utils.timeout import llm_timeout diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py index d29c54dc..03daca29 100644 --- a/internlm/core/scheduler/pipeline_scheduler.py +++ b/internlm/core/scheduler/pipeline_scheduler.py @@ -14,7 +14,12 @@ from internlm.core.context import global_context as gpc from internlm.core.engine import Engine from internlm.core.naive_amp import NaiveAMPModel -from internlm.utils.common import check_data_is_packed, get_current_device, move_to_device, SchedulerHook +from internlm.utils.common import ( + SchedulerHook, + check_data_is_packed, + get_current_device, + move_to_device, +) from internlm.utils.logger import get_logger from internlm.utils.timeout import llm_timeout From 571d83c0542fcda802bc90d888f0ed5ce7d504ed Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 14:56:39 +0800 Subject: [PATCH 131/153] feat(format): fix ci lint check error --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 
e0e99aff..720a88f9 100644 --- a/train.py +++ b/train.py @@ -19,6 +19,7 @@ from internlm.monitor import initialize_monitor_manager, send_alert_message from internlm.monitor.monitor import monitor_manager as mm from internlm.train import ( + get_scheduler_hooks, get_train_data_loader, get_validation_data_loader, initialize_llm_profile, @@ -26,7 +27,6 @@ initialize_optimizer, load_new_batch, record_current_batch_training_metrics, - get_scheduler_hooks, ) from internlm.utils.common import ( BatchSkipper, From 83517ca37e0d7fcc69478ee97617248ac642b164 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 19:30:09 +0800 Subject: [PATCH 132/153] feat(evaluation.py): fix evaluation error when msp/fsp with pp --- internlm/model/modeling_moe.py | 4 ++-- internlm/solver/optimizer/hybrid_zero_optim.py | 8 -------- internlm/train/training_internlm.py | 2 +- internlm/train/utils.py | 15 ++++++++------- internlm/utils/evaluation.py | 10 ++++++---- 5 files changed, 17 insertions(+), 22 deletions(-) diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py index 2db6f727..5b6e85bf 100644 --- a/internlm/model/modeling_moe.py +++ b/internlm/model/modeling_moe.py @@ -373,8 +373,8 @@ def __init__( checkpoint_layer_num = int(num_layers * checkpoint) self.tp_mode = "mtp" - if isinstance(gpc.config.parallel.tensor, dict): - self.tp_mode = gpc.config.parallel.tensor.get("mode", "mtp") + if isinstance(gpc.config.parallel["tensor"], dict): + self.tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") if is_reward: head_cls = RewardModelLinear diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index b5ec92b0..21e93af2 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -164,15 +164,7 @@ def __init__( # add the fp16 params to fp16_param_groups for bookkeeping self._fp16_param_groups[group_id] = group_params - # to find real zero mode. 
if zero is not used, set all param group as ParallelMode.ZERO1 - # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode - # zero_mode = ( - # ParallelMode.ZERO1 - # if gpc.get_world_size(ParallelMode.ZERO1) == 1 or param_group["dp_mode"] == ParallelMode.DATA - # else ParallelMode.EXPERT_DATA - # ) zero_mode = param_group["optimizer_mode"] - self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) self._zero_world_size.append(gpc.get_world_size(zero_mode)) # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 6120974c..c13abfc3 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -344,7 +344,7 @@ def get_scheduler_hooks(metric, zero_optim, isp_communicator) -> List[SchedulerH ), ) - if isp_communicator is not None: + if isp_communicator is not None and gpc.config.parallel["weight"].get("overlap", False): scheduler_hooks.append(ISPCommunicatorSchedulerHook(isp_communicator, zero_optim)) return scheduler_hooks diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 4444b30d..2f57f11a 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -8,7 +8,7 @@ from internlm.utils.parallel import is_tensor_data_parallel_parameter -def split_params_into_different_groups_for_optimizer_with_new_partition_strategy( +def split_params_into_different_groups_for_optimizer( param_groups: Tuple[Dict], ) -> Tuple[Dict]: """Split parameters into different groups for optimizer @@ -24,8 +24,9 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy Tuple[Dict]: list of params groups for optimizer Output Example: >>> ( - >>> {'name': 'default','params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'embed_head', 'params': [tensor],'weight_decay' :xxx}, + >>> {'name': 'default', 'params': [tensor], 'weight_decay' :xxx}, + >>> {'name': 'embed_head', 'params': [tensor], 'weight_decay' :xxx}, + >>> {'name': 'fp32', 'params': [tensor], 'weight_decay' :xxx}, >>> ) """ @@ -38,8 +39,9 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} - if gpc.config.parallel.tensor.mode == "isp": + if isinstance(gpc.config.parallel["tensor"], dict) and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + # create new groups for fp32 parameter group new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} if gpc.config.model.get("num_experts", 1) > 1: @@ -71,7 +73,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["params"] = origin_params pgroup["optimizer_mode"] = ParallelMode.ZERO1 - # param groups may contain empty groups, such as embed_head + # param groups may contain empty groups, such as fp32 param_groups.extend(new_groups.values()) return tuple(param_groups) @@ -79,5 +81,4 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy def create_param_groups(model, weight_decay): parameters = {"params": list(model.parameters()), "name": "default", "weight_decay": weight_decay} - # return split_params_into_different_groups_for_optimizer(parameters) - return split_params_into_different_groups_for_optimizer_with_new_partition_strategy(parameters) + return 
split_params_into_different_groups_for_optimizer(parameters) diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index e52586b8..1c1515b4 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -50,10 +50,12 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - if gpc.config.parallel["tensor"]["mode"] == "mtp": - gpc.config.parallel.sequence_parallel = False - else: + # when training x.shape is torch.Size([1024, 4096]), linear all gather in dim=0(sequence dim) + # but evaluation x.shape is torch.Size([1, 1024, 4096]), gather in dim=0 is error. + if gpc.config.parallel["tensor"]["mode"] == "isp": gpc.config.parallel.sequence_parallel = True + else: + gpc.config.parallel.sequence_parallel = False yield finally: gpc.config.parallel.sequence_parallel = prev_mode @@ -102,7 +104,7 @@ def evaluate_on_val_dls( total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz - if gpc.config.parallel["tensor"]["mode"] == "isp": + if gpc.config.parallel.sequence_parallel: sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR) tensor_shape = torch.Size( [ From 0ec9b67aa629def980149d1a5a8ac9ff3d9231f1 Mon Sep 17 00:00:00 2001 From: JiaoPL Date: Thu, 25 Jan 2024 14:23:44 +0800 Subject: [PATCH 133/153] fix moe param groups --- .../solver/optimizer/hybrid_zero_optim.py | 36 ++++++++++++++----- internlm/solver/optimizer/utils.py | 18 ++-------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index d7ffb1c0..44111cb9 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -673,17 +673,21 @@ def _compute_param_norm_stage( grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] - if group_id == 0: + if self.optim.param_groups[group_id]["name"] in ("default", "fp32"): for param in params: if self.use_isp: setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) else: setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif group_id == 1: + elif self.optim.param_groups[group_id]["name"] == "embed_head": + # should be isp mode for param in params: setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif self._is_moe_group(self.optim.param_groups[group_id]): + for param in params: + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) else: - raise NotImplementedError("group_id > 1 is not yet implemented.") + raise NotImplementedError("unrecognized parameter group.") if self._clip_grad_norm > 0: total_param_norms = compute_param_norm( @@ -704,6 +708,8 @@ def _compute_param_norm_stage( delattr(param, IS_TENSOR_ZERO_PARALLEL) if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): delattr(param, IS_WEIGHT_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL): + delattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL) return total_param_norms @@ -718,17 +724,21 @@ def _compute_vocab_grad_norm_stage( grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] - if group_id == 0: + if self.optim.param_groups[group_id]["name"] in ("default", "fp32"): for param in params: if self.use_isp: setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) else: setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif group_id == 1: + elif self.optim.param_groups[group_id]["name"] == "embed_head": + # should be isp mode for 
param in params: setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif self._is_moe_group(self.optim.param_groups[group_id]): + for param in params: + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) else: - raise NotImplementedError("group_id > 1 is not yet implemented.") + raise NotImplementedError("unrecognized parameter group.") vocab_grad_norm = None @@ -751,6 +761,8 @@ def _compute_vocab_grad_norm_stage( delattr(param, IS_TENSOR_ZERO_PARALLEL) if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): delattr(param, IS_WEIGHT_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL): + delattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL) return vocab_grad_norm @@ -767,17 +779,21 @@ def _count_zero_grads_stage( grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] - if group_id == 0: + if self.optim.param_groups[group_id]["name"] in ("default", "fp32"): for param in params: if self.use_isp: setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) else: setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif group_id == 1: + elif self.optim.param_groups[group_id]["name"] == "embed_head": + # should be isp mode for param in params: setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif self._is_moe_group(self.optim.param_groups[group_id]): + for param in params: + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) else: - raise NotImplementedError("group_id > 1 is not yet implemented.") + raise NotImplementedError("unrecognized parameter group.") if self._clip_grad_norm > 0: total_zero_grad_count = compute_zero_grad_count( @@ -798,6 +814,8 @@ def _count_zero_grads_stage( delattr(param, IS_TENSOR_ZERO_PARALLEL) if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): delattr(param, IS_WEIGHT_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL): + delattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL) return total_zero_grad_count diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index d2769474..ff707a42 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -239,13 +239,6 @@ def reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=False if fine_grained: parallel_grads = {} - if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): - param_parallel_mode = ParallelMode.TENSOR - elif gpc.is_using_parallel_mode(weight_parallel_mode): - param_parallel_mode = weight_parallel_mode - else: - param_parallel_mode = ParallelMode.TENSOR - def append_grad(g, p): if fine_grained: param_name = p.param_name if hasattr(p, "param_name") else "unknown-padding" @@ -255,7 +248,7 @@ def append_grad(g, p): elif only_output: param_name = p.param_name if hasattr(p, "param_name") else "unknown-padding" if ( - gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(param_parallel_mode) + gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(ParallelMode.TENSOR) and gpc.config.model["hidden_size"] == g.shape[1] and "embedding" not in param_name.lower() ): @@ -451,13 +444,6 @@ def compute_vocab_grad_norm( norm_type = float(norm_type) vocab_size = gpc.config.model["vocab_size"] - if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): - param_parallel_mode = ParallelMode.TENSOR - elif gpc.is_using_parallel_mode(weight_parallel_mode): - param_parallel_mode = weight_parallel_mode - else: - param_parallel_mode = ParallelMode.TENSOR - param_grads = reduce_grads(gradients, parameters, weight_parallel_mode, 
only_output=True) vocab_grad_norm = torch.zeros((vocab_size,), dtype=torch.float32).to(get_current_device()) @@ -465,7 +451,7 @@ def compute_vocab_grad_norm( for grad in param_grads: # get grad norm of each vocab vocab_slice_size = grad.shape[0] - local_tp_rank = gpc.get_local_rank(param_parallel_mode) + local_tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) for i in range(vocab_slice_size): cur_vocab_grad_norm = get_norm([grad[i, :]], norm_type, enable_cuda_kernels)[0] vocab_grad_norm[i + vocab_slice_size * local_tp_rank] += get_tensor_norm( From aa388b54d01edc9046b1e18f8b20b9debdecb72b Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 25 Jan 2024 14:32:48 +0800 Subject: [PATCH 134/153] modify the distributedAttention for different data pack mode --- internlm/core/context/parallel_context.py | 1 + internlm/model/multi_head_attention.py | 65 +++++++++++++---------- internlm/utils/evaluation.py | 7 ++- 3 files changed, 43 insertions(+), 30 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index fd53c4be..e1bdb601 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -157,6 +157,7 @@ def __init__(self): self.virtual_pipeline_parallel_size = None self.virtual_pipeline_parallel_rank = None self._expert_parallel_group_names = [] + self.evaluation = False @property def config(self): diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 825e3f21..01a88034 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -80,48 +80,57 @@ def __init__( self, local_attention: Module, sequence_process_group: dist.ProcessGroup, - first_scatter_idx: int = 2, - first_gather_idx: int = 0, - second_scatter_idx: int = 0, - second_gather_idx: int = 1, ) -> None: super().__init__() self.local_attn = local_attention self.spg = sequence_process_group - self.first_scatter_idx = first_scatter_idx - self.first_gather_idx = first_gather_idx - self.second_scatter_idx = second_scatter_idx - self.second_gather_idx = second_gather_idx + self._scatter_gather_idx = {} + + # scatter_gather_idx contains the scatter and gather index for different data packed mode + # key is the data packed mode, which should be in ['qkv', 'kv', 'q', 'output'] + # value is the scatter and gather index in all2all + self._scatter_gather_idx['qkv'] = [2, 0] # qkv shape:[sequence, 3, head, head_dim] + self._scatter_gather_idx['kv'] = [2, 0] # kv shape: [sequence, 2, head, head_dim] + self._scatter_gather_idx['q'] = [1, 0] # q/k/v shape: [sequence, head, head_dim] + self._scatter_gather_idx['output'] = [0, 1] # output shape: [sequence, head, head_dim] + + + def forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, **kwargs: Any) -> Tensor: + if gpc.evaluation is True: + # when conducting evaluation, the scatter and gather index should add 1. 
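For intuition, the index shift that the evaluation branch computes below can be checked on concrete shapes; a minimal sketch (illustrative values only, independent of the classes in this patch):

    # training inputs carry no batch dim, evaluation inputs do, so every
    # all2all dim index moves right by one
    train_idx = {"qkv": [2, 0], "kv": [2, 0], "q": [1, 0], "output": [0, 1]}
    eval_idx = {key: [x + 1 for x in value] for key, value in train_idx.items()}
    assert eval_idx["qkv"] == [3, 1]  # [batch, seq, 3, head, head_dim]: scatter dim 3, gather dim 1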
+ eval_scatter_gather_idx = {key: [x + 1 for x in value] for key, value in self._scatter_gather_idx.items()} + self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=eval_scatter_gather_idx, **kwargs) + else: + self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=self._scatter_gather_idx, **kwargs) - def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: + def _forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, scatter_gather: dict = None, **kwargs: Any) -> Tensor: """forward Arguments: - query (Tensor): query input to the layer - key (Tensor): key input to the layer - value (Tensor): value input to the layer + qkv (Tensor): packed qkv input to the layer + kv (Tensor): packed kv input to the layer + q (Tensor): q input to the layer + k (Tensor): k input to the layer + v (Tensor): v input to the layer args: other args Returns: * output (Tensor): context output """ - # Evaluation - if qkv.ndim == 5: - # in shape: [batch, seq/tp_size, 3, head, head_dim] - qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx + 1, self.first_gather_idx + 1) - # out shape : [batch, seq, head/tp_size, head_dim] - context_layer = self.local_attn(qkv, **kwargs) - # in shape: [batch, seq, head/tp_size, head_dim] - output = _SeqAllToAll.apply( - self.spg, context_layer, self.second_scatter_idx + 1, self.second_gather_idx + 1 - ) - else: # training - # in shape: [seq/tp_size, 3, head, head_dim] - qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx, self.first_gather_idx) - # out shape : [seq, head/tp_size, head_dim] + + if qkv is not None: + qkv = _SeqAllToAll.apply(self.spg, qkv, scatter_gather['qkv'][0], scatter_gather['qkv'][1]) context_layer = self.local_attn(qkv, **kwargs) - # in shape: [seq, head/tp_size, head_dim] - output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx, self.second_gather_idx) + elif kv is not None: + q = _SeqAllToAll.apply(self.spg, q, scatter_gather['q'][0], scatter_gather['q'][1]) + kv = _SeqAllToAll.apply(self.spg, kv, scatter_gather['kv'][0], scatter_gather['kv'][1]) + context_layer = self.local_attn(q, kv, **kwargs) + else: + q = _SeqAllToAll.apply(self.spg, q, scatter_gather['q'][0], scatter_gather['q'][1]) + k = _SeqAllToAll.apply(self.spg, k, scatter_gather['q'][0], scatter_gather['q'][1]) + v = _SeqAllToAll.apply(self.spg, v, scatter_gather['q'][0], scatter_gather['q'][1]) + context_layer = self.local_attn(q, k, v, **kwargs) + output = _SeqAllToAll.apply(self.spg, context_layer, scatter_gather['output'][0], scatter_gather['output'][1]) # out e.g., [s/p::h] return output diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 1c1515b4..1d840ac4 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -47,8 +47,9 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape @contextmanager -def switch_sequence_parallel_mode(): +def switch_evaluation_mode(): prev_mode = gpc.config.parallel.sequence_parallel + prev_evaluation = gpc.evaluation try: # when training x.shape is torch.Size([1024, 4096]), linear all gather in dim=0(sequence dim) # but evaluation x.shape is torch.Size([1, 1024, 4096]), gather in dim=0 is error. 
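The scatter/gather indices above feed _SeqAllToAll. A forward-only sketch of that primitive, using a hypothetical helper (the real _SeqAllToAll is an autograd.Function with a matching backward pass):

    import torch
    import torch.distributed as dist

    def seq_all_to_all(x: torch.Tensor, group, scatter_idx: int, gather_idx: int) -> torch.Tensor:
        # shard x along scatter_idx, exchange shards across the group, then
        # stitch the received shards back together along gather_idx; assumes
        # both dims divide evenly by the group size
        world_size = dist.get_world_size(group)
        inputs = [t.contiguous() for t in torch.tensor_split(x, world_size, dim=scatter_idx)]
        outputs = [torch.empty_like(t) for t in inputs]
        dist.all_to_all(outputs, inputs, group=group)
        return torch.cat(outputs, dim=gather_idx)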
@@ -56,9 +57,11 @@ def switch_sequence_parallel_mode(): gpc.config.parallel.sequence_parallel = True else: gpc.config.parallel.sequence_parallel = False + gpc.evaluation = True yield finally: gpc.config.parallel.sequence_parallel = prev_mode + gpc.evaluation = prev_evaluation def evaluate_on_val_dls( @@ -70,7 +73,7 @@ def evaluate_on_val_dls( update_panel: bool = False, streaming: bool = False, ): - with switch_sequence_parallel_mode(): + with switch_evaluation_mode(): torch.cuda.empty_cache() trainer.eval() verbose = gpc.is_rank_for_log() From 34b94790b08ca8e1260a398366cf44bfbb891318 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 25 Jan 2024 14:48:54 +0800 Subject: [PATCH 135/153] feat(model/multi_head_attention.py): fix return output --- internlm/model/multi_head_attention.py | 46 ++++++++++++++++---------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 01a88034..200d4a9f 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -85,25 +85,35 @@ def __init__( self.local_attn = local_attention self.spg = sequence_process_group self._scatter_gather_idx = {} - + # scatter_gather_idx contains the scatter and gather index for different data packed mode # key is the data packed mode, which should be in ['qkv', 'kv', 'q', 'output'] # value is the scatter and gather index in all2all - self._scatter_gather_idx['qkv'] = [2, 0] # qkv shape:[sequence, 3, head, head_dim] - self._scatter_gather_idx['kv'] = [2, 0] # kv shape: [sequence, 2, head, head_dim] - self._scatter_gather_idx['q'] = [1, 0] # q/k/v shape: [sequence, head, head_dim] - self._scatter_gather_idx['output'] = [0, 1] # output shape: [sequence, head, head_dim] - - - def forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, **kwargs: Any) -> Tensor: + self._scatter_gather_idx["qkv"] = [2, 0] # qkv shape:[sequence, 3, head, head_dim] + self._scatter_gather_idx["kv"] = [2, 0] # kv shape: [sequence, 2, head, head_dim] + self._scatter_gather_idx["q"] = [1, 0] # q/k/v shape: [sequence, head, head_dim] + self._scatter_gather_idx["output"] = [0, 1] # output shape: [sequence, head, head_dim] + + def forward( + self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, **kwargs: Any + ) -> Tensor: if gpc.evaluation is True: # when conducting evaluation, the scatter and gather index should add 1. 
eval_scatter_gather_idx = {key: [x + 1 for x in value] for key, value in self._scatter_gather_idx.items()} - self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=eval_scatter_gather_idx, **kwargs) + return self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=eval_scatter_gather_idx, **kwargs) else: - self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=self._scatter_gather_idx, **kwargs) + return self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=self._scatter_gather_idx, **kwargs) - def _forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, scatter_gather: dict = None, **kwargs: Any) -> Tensor: + def _forward( + self, + qkv: Tensor = None, + kv: Tensor = None, + q: Tensor = None, + k: Tensor = None, + v: Tensor = None, + scatter_gather: dict = None, + **kwargs: Any, + ) -> Tensor: """forward Arguments: @@ -119,18 +129,18 @@ def _forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: T """ if qkv is not None: - qkv = _SeqAllToAll.apply(self.spg, qkv, scatter_gather['qkv'][0], scatter_gather['qkv'][1]) + qkv = _SeqAllToAll.apply(self.spg, qkv, scatter_gather["qkv"][0], scatter_gather["qkv"][1]) context_layer = self.local_attn(qkv, **kwargs) elif kv is not None: - q = _SeqAllToAll.apply(self.spg, q, scatter_gather['q'][0], scatter_gather['q'][1]) - kv = _SeqAllToAll.apply(self.spg, kv, scatter_gather['kv'][0], scatter_gather['kv'][1]) + q = _SeqAllToAll.apply(self.spg, q, scatter_gather["q"][0], scatter_gather["q"][1]) + kv = _SeqAllToAll.apply(self.spg, kv, scatter_gather["kv"][0], scatter_gather["kv"][1]) context_layer = self.local_attn(q, kv, **kwargs) else: - q = _SeqAllToAll.apply(self.spg, q, scatter_gather['q'][0], scatter_gather['q'][1]) - k = _SeqAllToAll.apply(self.spg, k, scatter_gather['q'][0], scatter_gather['q'][1]) - v = _SeqAllToAll.apply(self.spg, v, scatter_gather['q'][0], scatter_gather['q'][1]) + q = _SeqAllToAll.apply(self.spg, q, scatter_gather["q"][0], scatter_gather["q"][1]) + k = _SeqAllToAll.apply(self.spg, k, scatter_gather["q"][0], scatter_gather["q"][1]) + v = _SeqAllToAll.apply(self.spg, v, scatter_gather["q"][0], scatter_gather["q"][1]) context_layer = self.local_attn(q, k, v, **kwargs) - output = _SeqAllToAll.apply(self.spg, context_layer, scatter_gather['output'][0], scatter_gather['output'][1]) + output = _SeqAllToAll.apply(self.spg, context_layer, scatter_gather["output"][0], scatter_gather["output"][1]) # out e.g., [s/p::h] return output From 10309b8b8b358d6c77320d4e9fe603d41ea8e66b Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 25 Jan 2024 15:46:49 +0800 Subject: [PATCH 136/153] feat(utils/evaluation.py): rename gpc.evaluation to gpc.is_evaluating --- internlm/core/context/parallel_context.py | 2 +- internlm/utils/evaluation.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index e1bdb601..d597575c 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -157,7 +157,7 @@ def __init__(self): self.virtual_pipeline_parallel_size = None self.virtual_pipeline_parallel_rank = None self._expert_parallel_group_names = [] - self.evaluation = False + self.is_evaluating = False @property def config(self): diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 1d840ac4..a10cf243 100644 --- a/internlm/utils/evaluation.py +++ 
b/internlm/utils/evaluation.py @@ -48,20 +48,22 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape @contextmanager def switch_evaluation_mode(): - prev_mode = gpc.config.parallel.sequence_parallel - prev_evaluation = gpc.evaluation + prev_seq = gpc.config.parallel.sequence_parallel + prev_eval = gpc.is_evaluating try: + gpc.is_evaluating = True + # when training x.shape is torch.Size([1024, 4096]), linear all gather in dim=0(sequence dim) # but evaluation x.shape is torch.Size([1, 1024, 4096]), gather in dim=0 is error. if gpc.config.parallel["tensor"]["mode"] == "isp": gpc.config.parallel.sequence_parallel = True else: gpc.config.parallel.sequence_parallel = False - gpc.evaluation = True + yield finally: - gpc.config.parallel.sequence_parallel = prev_mode - gpc.evaluation = prev_evaluation + gpc.config.parallel.sequence_parallel = prev_seq + gpc.is_evaluating = prev_eval def evaluate_on_val_dls( From 4c8324a5b95a517d90505926289b91677c208273 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 25 Jan 2024 15:55:39 +0800 Subject: [PATCH 137/153] feat(multi_head_attention.py): rename gpc.evaluation to gpc.is_evaluating --- internlm/model/multi_head_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 200d4a9f..2d3a5959 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -97,7 +97,7 @@ def __init__( def forward( self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, **kwargs: Any ) -> Tensor: - if gpc.evaluation is True: + if gpc.is_evaluating is True: # when conducting evaluation, the scatter and gather index should add 1. eval_scatter_gather_idx = {key: [x + 1 for x in value] for key, value in self._scatter_gather_idx.items()} return self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=eval_scatter_gather_idx, **kwargs) From f186a7548ae9e2919d0c620331dc67c336ef43cb Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 25 Jan 2024 18:32:46 +0800 Subject: [PATCH 138/153] feat(communication/isp.py): refactor isp communicator to adapt to different model structures --- internlm/core/communication/isp.py | 46 +++++++++++------------------ internlm/train/training_internlm.py | 3 -- 2 files changed, 17 insertions(+), 32 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index d0bbe2ed..fd917b77 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -24,11 +24,9 @@ class ISPCommModelConfig: model config for isp communicator. """ - hidden_size: int = 0 - mlp_ratio: float = 0 dtype: torch.dtype = torch.half device: torch.device = torch.device("cuda") - modules: List[str] = None + module_shapes: Dict[str, torch.Size] = None class MemoryPool: @@ -41,11 +39,9 @@ def __init__( model_conf: ISPCommModelConfig, with_bias: bool = False, ) -> None: - self._hidden_size = model_conf.hidden_size - self._mlp_ratio = model_conf.mlp_ratio self._dtype = model_conf.dtype self._device = model_conf.device - self._module_shapes = self._init_module_shape(model_conf.modules) + self._module_shapes = model_conf.module_shapes # due to intern sequence parallel communication overlap, we need # **two** memory pools for current block weights and the next block weights. @@ -75,21 +71,6 @@ def __init__( # memory pool for constant zero tensors, allocated lazily. 
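As noted above, the weight pools come in pairs so that compute on block i can overlap the all-gather for block i + 1. A toy sketch of the slot alternation (illustrative names, not the class's real fields):

    def weight_pool_slots(block_idx: int) -> tuple:
        # block i computes out of one slot while the prefetch for block i + 1
        # fills the other; the two slots simply alternate
        return block_idx % 2, (block_idx + 1) % 2

    assert weight_pool_slots(3) == (1, 0)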
self._zero_const_pool = {} - def _init_module_shape(self, modules: List[str]) -> Dict[str, torch.Size]: - mlp_hidden_size = 256 * ((int(self._hidden_size * self._mlp_ratio) + 256 - 1) // 256) - - # TODO: the memory pool should be more generic. - # Currently, it only supports llama-class models with specific naming structure. - static_shapes = { - "Wqkv": torch.Size((3 * self._hidden_size, self._hidden_size)), - "out_proj": torch.Size((self._hidden_size, self._hidden_size)), - "w1": torch.Size((mlp_hidden_size, self._hidden_size)), - "w2": torch.Size((mlp_hidden_size, self._hidden_size)), - "w3": torch.Size((self._hidden_size, mlp_hidden_size)), - } - - return {name: static_shapes[name] for name in modules} - def allocate_constant_zero(self, size: tuple) -> torch.Tensor: if size not in self._zero_const_pool: self._zero_const_pool[size] = torch.zeros(*size, dtype=self._dtype, device=self._device).contiguous() @@ -180,9 +161,9 @@ def __init__( self.overlap = overlap self.enable_memory_pool = overlap and enable_memory_pool self.model_conf = model_conf - self.module_name = model_conf.modules.copy() self.is_forward = True self.reduce_scatter_handlers = {} + self._module_shapes = {} # real overlap state for each chunk. self._overlap_states: Dict[int, ISPOverlapState] = {} @@ -207,12 +188,6 @@ def __init__( # key: transformer block index; value: isp modules self._index_to_isp_module = None - # init memory pool if necessary. - if self.enable_memory_pool: - self.memory_pool = MemoryPool(model_conf, with_bias=True) - else: - self.memory_pool = None - # init overlap states if necessary. if self.overlap: # just want to share same for loop for modulelist and module. @@ -228,6 +203,13 @@ def __init__( self._register_sync_parameters_hook() # switch to chunk 0 at first. self.switch_current_model_chunk(0) + self.model_conf.module_shapes = self._module_shapes + + # init memory pool if necessary. 
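Pool construction now has to wait until _parse_model_structure (shown further below) has recorded module_shapes, since shapes are discovered from the sharded weights instead of being hard-coded. A worked example of that recovery, assuming hidden_size=4096 and a weight-parallel size of 4:

    import torch

    weight_parallel_size = 4                  # assumed wp world size
    local_weight = torch.empty(1024, 4096)    # e.g. an out_proj shard with hidden_size=4096
    origin_shape = torch.Size([local_weight.shape[0] * weight_parallel_size, *local_weight.shape[1:]])
    assert origin_shape == torch.Size((4096, 4096))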
+ if self.enable_memory_pool: + self.memory_pool = MemoryPool(self.model_conf, with_bias=True) + else: + self.memory_pool = None def _parse_model_structure(self, cid: int, model: nn.Module) -> None: self._overlap_states[cid] = ISPOverlapState() @@ -246,10 +228,16 @@ def _parse_model_structure(self, cid: int, model: nn.Module) -> None: self._overlap_states[cid].index_to_isp_module[idx] = [] for sub_name, sub in block.named_children(): for name, child in sub.named_children(): - if name == "out_proj": + if name in ["out_proj", "wo"]: self._overlap_states[cid].isp_outs.append(child) self._overlap_states[cid].module_to_index[child] = idx if isinstance(child, ISPLinear): + if name not in self._module_shapes: + origin_shape = tuple( + [child.weight.shape[0] * gpc.weight_parallel_size] + + list(child.weight.shape[1:]) + ) + self._module_shapes[name] = torch.Size(origin_shape) self._overlap_states[cid].module_to_index[child] = idx self._overlap_states[cid].isp_modules.append(child) self._overlap_states[cid].index_to_isp_module[idx].append(child) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index c13abfc3..b43cfcb3 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -222,11 +222,8 @@ def initialize_model(pre_process_func: Optional[Callable] = None, post_process_f isp_communicator = ISPCommunicator( model, ISPCommModelConfig( - gpc.config.model.hidden_size, - gpc.config.model.mlp_ratio, gpc.config.model.dtype, get_current_device(), - ["Wqkv", "out_proj", "w1", "w2", "w3"], ), gpc.config.parallel.weight.overlap, gpc.config.model.checkpoint, From 3d7402d59ddc3eeb451fadb8cab751c87a953436 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 26 Jan 2024 11:56:47 +0800 Subject: [PATCH 139/153] fix(tests): fix ci test error --- internlm/train/__init__.py | 4 ++++ tests/test_core/utils.py | 4 +++- tests/test_training/test_load_ckpt_loss.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/internlm/train/__init__.py b/internlm/train/__init__.py index e4f049d7..9a70e1e2 100644 --- a/internlm/train/__init__.py +++ b/internlm/train/__init__.py @@ -7,6 +7,8 @@ initialize_optimizer, load_new_batch, record_current_batch_training_metrics, + set_fp32_attr_for_model, + set_parallel_attr_for_param_groups, wrap_FSDP_model, ) @@ -20,4 +22,6 @@ "record_current_batch_training_metrics", "wrap_FSDP_model", "get_scheduler_hooks", + "set_parallel_attr_for_param_groups", + "set_fp32_attr_for_model", ] diff --git a/tests/test_core/utils.py b/tests/test_core/utils.py index 3d25667f..f7d562e2 100644 --- a/tests/test_core/utils.py +++ b/tests/test_core/utils.py @@ -13,7 +13,7 @@ from internlm.core.scheduler import InterleavedPipelineScheduler, NonPipelineScheduler, PipelineScheduler from internlm.model.metrics import SchedulerMetricHook from internlm.solver.pipeline_utils import partition_uniform -from internlm.train import initialize_optimizer +from internlm.train import initialize_optimizer, set_parallel_attr_for_param_groups class MlpModel(nn.Module): @@ -67,6 +67,8 @@ def init_model_and_optim( pp_model = _build_generic_model_1d(num_layers=num_layers, num_chunks=num_chunks, embedding=embedding) pp_model = pp_model.to(dtype) + set_parallel_attr_for_param_groups(pp_model) + # pp scheduler scheduler_hooks = [ SchedulerMetricHook(skip=True), diff --git a/tests/test_training/test_load_ckpt_loss.py b/tests/test_training/test_load_ckpt_loss.py index a5156870..01a11299 100644 --- 
a/tests/test_training/test_load_ckpt_loss.py +++ b/tests/test_training/test_load_ckpt_loss.py @@ -12,7 +12,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.context.parallel_context import Config -from internlm.core.scheduler import SchedulerMetricHook +from internlm.model.metrics import SchedulerMetricHook from internlm.core.trainer import TrainState from internlm.initialize.launch import args_sanity_check from internlm.model.loss import FlashGPTLMLoss From 8170641936ecb791179f33c34d2a5a50b1cfc71a Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 26 Jan 2024 15:28:05 +0800 Subject: [PATCH 140/153] fix(tests): fix ci pipeline test error --- tests/test_core/test_pipeline.py | 2 +- tests/test_core/utils.py | 4 +--- tests/test_utils/test_model_checkpoint.py | 2 ++ 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_core/test_pipeline.py b/tests/test_core/test_pipeline.py index db7b3ddd..f5b4ebb3 100644 --- a/tests/test_core/test_pipeline.py +++ b/tests/test_core/test_pipeline.py @@ -118,7 +118,7 @@ def exam_pipeline_parallel(args): ) output_list.append(output) - engine.step() + # engine.step() # torch related if gpc.is_last_rank(ParallelMode.PIPELINE): diff --git a/tests/test_core/utils.py b/tests/test_core/utils.py index f7d562e2..3d25667f 100644 --- a/tests/test_core/utils.py +++ b/tests/test_core/utils.py @@ -13,7 +13,7 @@ from internlm.core.scheduler import InterleavedPipelineScheduler, NonPipelineScheduler, PipelineScheduler from internlm.model.metrics import SchedulerMetricHook from internlm.solver.pipeline_utils import partition_uniform -from internlm.train import initialize_optimizer, set_parallel_attr_for_param_groups +from internlm.train import initialize_optimizer class MlpModel(nn.Module): @@ -67,8 +67,6 @@ def init_model_and_optim( pp_model = _build_generic_model_1d(num_layers=num_layers, num_chunks=num_chunks, embedding=embedding) pp_model = pp_model.to(dtype) - set_parallel_attr_for_param_groups(pp_model) - # pp scheduler scheduler_hooks = [ SchedulerMetricHook(skip=True), diff --git a/tests/test_utils/test_model_checkpoint.py b/tests/test_utils/test_model_checkpoint.py index 2063591c..c649d251 100644 --- a/tests/test_utils/test_model_checkpoint.py +++ b/tests/test_utils/test_model_checkpoint.py @@ -16,6 +16,8 @@ LOCAL_SAVE_PATH, del_tmp_file, init_config, + init_dist_and_model, + reset_singletons, ) # (TOTAL_STEP, CKPT_EVERY, SNPASHOT_EVERY) From 85dd51fb73173369f3715b8b615cf1fc6b14a042 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 26 Jan 2024 17:55:09 +0800 Subject: [PATCH 141/153] feat(utils/common.py): remove func get_megatron_flops_2 --- internlm/train/training_internlm.py | 3 --- internlm/utils/common.py | 37 ----------------------------- internlm/utils/gputest.py | 17 +++++-------- train.py | 14 ----------- 4 files changed, 6 insertions(+), 65 deletions(-) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 60f9f821..2ca66be5 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -538,7 +538,6 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): @llm_timeout(func_name="record_current_batch_training_metrics") def record_current_batch_training_metrics( get_tflops_func, - get_tflops_func_2, logger, writer, success_update, @@ -623,7 +622,6 @@ def record_current_batch_training_metrics( tgs_SMA = 
round(tgs_statistic["SMA_tg_50"] / tgs_statistic["SMA_time_50"], 2) tflops = get_tflops_func((time.time() - start_time)) - tflops_2 = get_tflops_func_2((time.time() - start_time)) tgs_origin = round( num_tokens_in_batch @@ -635,7 +633,6 @@ def record_current_batch_training_metrics( infos = { "tflops": tflops, - "tflops2": tflops_2, "step": batch_count, "loss": loss.item() - moe_loss.item() if moe_loss is not None else loss.item(), "tgs (tokens/gpu/second)": tgs_origin, diff --git a/internlm/utils/common.py b/internlm/utils/common.py index 7ef57278..39e2d902 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -232,43 +232,6 @@ def get_megatron_flops( return tflops -def get_megatron_flops_2( - elapsed_time_per_iter, - checkpoint=False, - seq_len=2048, - hidden_size=12, - num_layers=32, - vocab_size=12, - global_batch_size=4, - global_world_size=1, - mlp_ratio=4, - use_swiglu=True, -): - """ - Calc flops based on the paper of Megatron https://deepakn94.github.io/assets/papers/megatron-sc21.pdf - """ - - checkpoint_activations_factor = 4 if checkpoint else 3 - flashattn_activations_factor = 4.5 if checkpoint else 3.5 - - if use_swiglu: - mlp_ratio = mlp_ratio * 3 / 2 - - flops_per_iteration = ( - checkpoint_activations_factor - * (8 + mlp_ratio * 4) - * global_batch_size - * seq_len - * hidden_size**2 - * num_layers - + 4 * global_batch_size * seq_len**2 * hidden_size * num_layers * flashattn_activations_factor - + 6 * global_batch_size * seq_len * hidden_size * vocab_size - ) - - tflops = flops_per_iteration / (elapsed_time_per_iter * global_world_size * (10**12)) - return tflops - - class DummyProfile: """ Dummy Profile. diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index 39c7341e..9224118a 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -43,17 +43,12 @@ def empty_cache_and_diag(batch_count, interval=50): if batch_count > 0: if gpc.is_rank_for_log(): logger.info("Empty Cache and Diagnosis GPU/NCCL/Timer ...") - # with torch.no_grad(): - # try: - # timer_diagnosis() - # bench_gpu() - # bench_net() - # except torch.distributed.DistBackendError as e: - # # import time - # # time.sleep(10) - # print(e, "rank = ", gpc.get_global_rank(), flush=True) - # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") - + with torch.no_grad(): + timer_diagnosis() + bench_gpu() + # FIXME: Runtime benchmark diagnosis can easily cause the training process + # to exit due to NCCL errors. + # bench_net() # do empty_cache after the bench torch.cuda.empty_cache() # do garbage collection diff --git a/train.py b/train.py index 720a88f9..9620268d 100644 --- a/train.py +++ b/train.py @@ -31,7 +31,6 @@ from internlm.utils.common import ( BatchSkipper, get_megatron_flops, - get_megatron_flops_2, launch_time, parse_args, ) @@ -87,18 +86,6 @@ def main(args): mlp_ratio=gpc.config.model["mlp_ratio"], ) - get_tflops_func_2 = partial( - get_megatron_flops_2, - checkpoint=gpc.config.model.checkpoint, - seq_len=gpc.config.SEQ_LEN, - hidden_size=gpc.config.model.hidden_size, - num_layers=gpc.config.model.num_layers, - vocab_size=gpc.config.model.vocab_size, - global_batch_size=gpc.config.data.micro_bsz * gpc.config.data.micro_num * gpc.get_world_size(ParallelMode.DATA), - global_world_size=gpc.get_world_size(ParallelMode.GLOBAL), - mlp_ratio=gpc.config.MLP_RATIO, - ) - # get and broadcast current time current_time = launch_time() objs = [current_time] @@ -265,7 +252,6 @@ def main(args): # calculate and record the training metrics, eg. 
loss, accuracy and so on. record_current_batch_training_metrics( get_tflops_func=get_tflops_func, - get_tflops_func_2=get_tflops_func_2, logger=logger, writer=writer, success_update=success_update, From 971c8eb7f19a4b7378d9b72c314732031212289a Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 15:22:46 +0800 Subject: [PATCH 142/153] feat(communication/isp.py): isp communicator support 0.x activation ckpt --- internlm/core/communication/isp.py | 46 ++++++++++++++++------------- internlm/train/training_internlm.py | 2 +- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index fd917b77..47e37842 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -26,6 +26,7 @@ class ISPCommModelConfig: dtype: torch.dtype = torch.half device: torch.device = torch.device("cuda") + activation_checkpointing: float = 0.0 module_shapes: Dict[str, torch.Size] = None @@ -131,7 +132,8 @@ def __init__(self) -> None: self.num_blocks: int = 0 self.embedding: List[nn.Module] = [] self.head: List[nn.Module] = [] - self.last_block: nn.Moudle = None + self.ckpt_block_num: int = 0 + self.last_ckpt_block: nn.Module = None self.isp_outs: List[nn.Module] = [] self.isp_modules: List[nn.Module] = [] self.index_to_isp_module: Dict[int, nn.Module] = {} @@ -152,12 +154,10 @@ def __init__( model: Union[nn.Module, nn.ModuleList], model_conf: ISPCommModelConfig, overlap: bool = False, - activation_checkpointing: bool = False, enable_memory_pool: bool = False, process_group: dist.ProcessGroup = None, ) -> None: self.process_group = process_group - self.model_checkpoint = activation_checkpointing self.overlap = overlap self.enable_memory_pool = overlap and enable_memory_pool self.model_conf = model_conf @@ -172,7 +172,8 @@ def __init__( self._num_blocks = None self._head = None self._embedding = None - self._last_block = None + self._ckpt_block_num = None + self._last_ckpt_block = None self._isp_outs = None self._isp_modules = None # key: isp module; value: module global all-gather op handle @@ -222,7 +223,10 @@ def _parse_model_structure(self, cid: int, model: nn.Module) -> None: elif isinstance(children, Embedding1D): self._overlap_states[cid].embedding.append(children) elif isinstance(children, nn.ModuleList): - self._overlap_states[cid].last_block = children[-1] + self._overlap_states[cid].ckpt_block_num = int(self.model_conf.activation_checkpointing * len(children)) + self._overlap_states[cid].last_ckpt_block = children[ + max(0, self._overlap_states[cid].ckpt_block_num - 1) + ] for idx, block in enumerate(children): self._overlap_states[cid].index_to_isp_module[idx] = [] @@ -335,7 +339,7 @@ def _post_forward_hook_for_embedding(self, *args): # pylint: disable=W0613 def _pre_forward_hook_for_out_proj(self, module: nn.Module, *args): # pylint: disable=W0613 block_index = self._module_to_index[module] - if self.model_checkpoint and self.is_forward is False: + if (block_index - 1 < self._ckpt_block_num) and self.is_forward is False: if block_index - 1 >= 0: self._all_gather_block_weight(block_index - 1) else: @@ -350,13 +354,13 @@ def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: dis self._wait_handle(module) def _pre_forward_hook_for_block(self, *args): # pylint: disable=W0613 - for module in self._index_to_isp_module[self._num_blocks - 1]: + for module in self._index_to_isp_module[self._ckpt_block_num - 1]: self._all_gather_module_weight(module) - 
self._wait_handle(module) + # self._wait_handle(module) def _post_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 self._clear_handle(module) - if not (self.model_checkpoint and self.is_forward is False): + if not ((self._module_to_index[module] < self._ckpt_block_num) and self.is_forward is False): self._clear_weight(module) def _post_backward_hook_for_head(self, *args): # pylint: disable=W0613 @@ -377,7 +381,8 @@ def _pre_backward_hook_for_module(self, module: nn.Module, *args): # pylint: di module_index = self._isp_modules.index(module) if module_index - 1 >= 0: next_module = self._isp_modules[module_index - 1] - self._all_gather_module_weight(next_module) + if self._module_to_index[next_module] >= self._ckpt_block_num: + self._all_gather_module_weight(next_module) def _post_backward_hook_for_module(self, module, *args): # pylint: disable=W0613 self._clear_handle(module) @@ -396,12 +401,12 @@ def _register_sync_parameters_hook(self) -> None: for embedding in self._embedding: embedding.register_forward_hook(self._post_forward_hook_for_embedding) - if self.model_checkpoint: - if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): - for head in self._head: - head.register_full_backward_pre_hook(self._pre_backward_hook_for_head) - else: - self._last_block.register_forward_pre_hook(self._pre_forward_hook_for_block) + if self._ckpt_block_num >= 1: + # if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): + # for head in self._head: + # head.register_full_backward_pre_hook(self._pre_backward_hook_for_head) + # else: + self._last_ckpt_block.register_forward_pre_hook(self._pre_forward_hook_for_block) for out_proj in self._isp_outs: out_proj.register_forward_pre_hook(self._pre_forward_hook_for_out_proj) @@ -414,7 +419,7 @@ def _register_sync_parameters_hook(self) -> None: # 1. register post_backward_hook @head module to prefetch for the last block's last module # 2. register pre_backward_hook @isp_module to wait handle for current module and to prefetch for next module # 3. 
register post_backward_hook @isp_module to release resource - if not self.model_checkpoint: + if self._ckpt_block_num < self._num_blocks: for head in self._head: head.register_full_backward_hook(self._post_backward_hook_for_head) @@ -443,7 +448,8 @@ def switch_current_model_chunk(self, chunk_id: int) -> None: self._bias_global_output = self._overlap_states[chunk_id].bias_global_output self._module_to_index = self._overlap_states[chunk_id].module_to_index self._index_to_isp_module = self._overlap_states[chunk_id].index_to_isp_module - self._last_block = self._overlap_states[chunk_id].last_block + self._ckpt_block_num = self._overlap_states[chunk_id].ckpt_block_num + self._last_ckpt_block = self._overlap_states[chunk_id].last_ckpt_block self._head = self._overlap_states[chunk_id].head self._embedding = self._overlap_states[chunk_id].embedding self._num_blocks = self._overlap_states[chunk_id].num_blocks @@ -514,7 +520,7 @@ def __init__(self, overlap_handler: ISPCommunicator, zero_optim) -> None: self._zero_optim = zero_optim def before_forward(self, scheduler, inputs) -> None: - if self._isp_communicator.model_checkpoint: + if self._isp_communicator._ckpt_block_num > 0: self._isp_communicator.is_forward = True # switch model chunk before forward chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank @@ -530,7 +536,7 @@ def after_criterion(self, scheduler, loss) -> None: pass def before_backward(self, scheduler, outputs, outputs_grad) -> None: - if self._isp_communicator.model_checkpoint: + if self._isp_communicator._ckpt_block_num > 0: self._isp_communicator.is_forward = False # switch model chunk before backward chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2ca66be5..7924da69 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -224,9 +224,9 @@ def initialize_model(pre_process_func: Optional[Callable] = None, post_process_f ISPCommModelConfig( gpc.config.model.dtype, get_current_device(), + gpc.config.model.checkpoint, ), gpc.config.parallel.weight.overlap, - gpc.config.model.checkpoint, gpc.config.parallel.weight.memory_pool, gpc.get_group(ParallelMode.WEIGHT), ) From 6853babe1e1bf8eaf93cd5a11c8d75ce92cb464c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 15:55:58 +0800 Subject: [PATCH 143/153] feat(train/training_internlm.py): move isp init to func initialize_isp_communicator --- internlm/train/__init__.py | 2 ++ internlm/train/training_internlm.py | 49 ++++++++++++++++++----------- train.py | 6 +++- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/internlm/train/__init__.py b/internlm/train/__init__.py index 9a70e1e2..d44eaec9 100644 --- a/internlm/train/__init__.py +++ b/internlm/train/__init__.py @@ -2,6 +2,7 @@ get_scheduler_hooks, get_train_data_loader, get_validation_data_loader, + initialize_isp_communicator, initialize_llm_profile, initialize_model, initialize_optimizer, @@ -17,6 +18,7 @@ "get_validation_data_loader", "initialize_llm_profile", "initialize_model", + "initialize_isp_communicator", "initialize_optimizer", "load_new_batch", "record_current_batch_training_metrics", diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 7924da69..62a9d060 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -216,24 +216,7 @@ def 
initialize_model(pre_process_func: Optional[Callable] = None, post_process_f # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": - isp_communicator = None - else: - isp_communicator = ISPCommunicator( - model, - ISPCommModelConfig( - gpc.config.model.dtype, - get_current_device(), - gpc.config.model.checkpoint, - ), - gpc.config.parallel.weight.overlap, - gpc.config.parallel.weight.memory_pool, - gpc.get_group(ParallelMode.WEIGHT), - ) - # register communicator for isp linear. - ISPLinear.register_communicator(isp_communicator) - - return model, isp_communicator + return model def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): @@ -269,6 +252,36 @@ def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): return model +def initialize_isp_communicator(model: Union[nn.Module, nn.ModuleList]): + """ + Initialize communicator for isp tensor parallel mode. + + Args: + model (:class:`torch.nn.Module`): Your model instance to be trained or evaluated. + + Returns: + An isp communicator for managing comp/comm overlap and memory pool. + """ + if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": + isp_communicator = None + else: + isp_communicator = ISPCommunicator( + model, + ISPCommModelConfig( + gpc.config.model.dtype, + get_current_device(), + gpc.config.model.checkpoint, + ), + gpc.config.parallel.weight.overlap, + gpc.config.parallel.weight.memory_pool, + gpc.get_group(ParallelMode.WEIGHT), + ) + # register communicator for isp linear. + ISPLinear.register_communicator(isp_communicator) + + return isp_communicator + + @llm_timeout(func_name="initialize_optimizer") def initialize_optimizer(model: Union[nn.Module, nn.ModuleList], isp_communicator: ISPCommunicator = None): """ diff --git a/train.py b/train.py index 9620268d..150f5463 100644 --- a/train.py +++ b/train.py @@ -22,6 +22,7 @@ get_scheduler_hooks, get_train_data_loader, get_validation_data_loader, + initialize_isp_communicator, initialize_llm_profile, initialize_model, initialize_optimizer, @@ -96,7 +97,10 @@ def main(args): uniscale_logger = initialize_llm_logger(start_time=current_time) # initialize model - model, isp_communicator = initialize_model() + model = initialize_model() + + # initialize isp communicator + isp_communicator = initialize_isp_communicator(model) with open(args.config, "r") as f: config_lines = f.readlines() From 8c45118c1c8d6217f759f61743e4475204819101 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 16:07:28 +0800 Subject: [PATCH 144/153] feat(communication/isp.py): fix prefetch last ckpt block wait handle --- internlm/core/communication/isp.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index 47e37842..bd744d10 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -9,7 +9,6 @@ from torch import distributed as dist from torch import nn -from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel from internlm.model.embedding import Embedding1D @@ -356,7 +355,6 @@ def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: dis def _pre_forward_hook_for_block(self, *args): # pylint: disable=W0613 for module in self._index_to_isp_module[self._ckpt_block_num - 1]: self._all_gather_module_weight(module) - # self._wait_handle(module) def 
_post_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 self._clear_handle(module) @@ -402,10 +400,6 @@ def _register_sync_parameters_hook(self) -> None: embedding.register_forward_hook(self._post_forward_hook_for_embedding) if self._ckpt_block_num >= 1: - # if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): - # for head in self._head: - # head.register_full_backward_pre_hook(self._pre_backward_hook_for_head) - # else: self._last_ckpt_block.register_forward_pre_hook(self._pre_forward_hook_for_block) for out_proj in self._isp_outs: From 011edcf27e3a5a020d9295e914e0542a8047debf Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 16:50:33 +0800 Subject: [PATCH 145/153] feat(utils/parallel.py): add func is_using_isp --- .../solver/optimizer/hybrid_zero_optim.py | 6 ++---- internlm/solver/optimizer/utils.py | 5 ++--- internlm/train/training_internlm.py | 20 ++++++++----------- internlm/train/utils.py | 4 ++-- internlm/utils/model_checkpoint.py | 17 ++++++++-------- internlm/utils/parallel.py | 18 ++++++++--------- 6 files changed, 32 insertions(+), 38 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 44111cb9..d603539b 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -38,6 +38,7 @@ from internlm.utils.common import get_current_device from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer +from internlm.utils.parallel import is_using_isp from internlm.utils.timeout import llm_timeout from .base_optimizer import BaseOptimizer @@ -85,10 +86,7 @@ def __init__( clip_grad_norm = zero_cfg.clip_grad_norm self._overlap_sync_grad = zero_cfg.overlap_sync_grad self._overlap_sync_param = zero_cfg.overlap_sync_param - self.use_isp = ( - isinstance(gpc.config.parallel["tensor"], dict) - and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" - ) + self.use_isp = is_using_isp() super().__init__(optim=optimizer) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index ff707a42..ffa06477 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -22,6 +22,7 @@ is_tensor_data_parallel_parameter, is_tensor_expert_data_parallel_parameter, is_tensor_zero_parallel_parameter, + is_using_isp, is_weight_zero_parallel_parameter, ) @@ -312,9 +313,7 @@ def compute_norm( Total norm of the parameters, need total_norm**(1/norm) before using. """ - weight_parallel_mode = ( - ParallelMode.WEIGHT if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.TENSOR - ) + weight_parallel_mode = ParallelMode.WEIGHT if is_using_isp() else ParallelMode.TENSOR enable_cuda_kernels = gradients[0].device.type == "cuda" # Norm parameters. 
norm_type = float(norm_type) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 62a9d060..2fe61b7d 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -78,6 +78,7 @@ from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( + is_using_isp, is_replica_zero_parallel_parameter, is_tensor_data_parallel_parameter, is_tensor_expert_data_parallel_parameter, @@ -105,8 +106,6 @@ def set_fp32_attr_for_model(model: Union[nn.Module, nn.ModuleList]): def set_parallel_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): - tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") - def _check_module(module): # layer_norm if isinstance(module, (RMSNorm, nn.LayerNorm)): @@ -120,9 +119,9 @@ def _check_module(module): # embedding and head if isinstance(module, (Embedding1D, ParallelGPT2Embeddings, BaseScaleColumnParallelLinear)): for param in module.parameters(): - if gpc.is_initialized(ParallelMode.TENSOR) and tp_mode == "isp": + if gpc.is_initialized(ParallelMode.TENSOR) and is_using_isp(): setattr(param, IS_TENSOR_DATA_PARALLEL, True) - elif gpc.is_initialized(ParallelMode.TENSOR) and tp_mode != "isp": + elif gpc.is_initialized(ParallelMode.TENSOR) and not is_using_isp(): setattr(param, IS_TENSOR_ZERO_PARALLEL, True) # for linear module @@ -131,9 +130,9 @@ def _check_module(module): if gpc.is_initialized(ParallelMode.EXPERT_DATA) and is_moe_param(param): # module should be MoE experts's linear setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) - elif not is_moe_param(param) and gpc.is_initialized(ParallelMode.TENSOR) and tp_mode != "isp": + elif not is_moe_param(param) and gpc.is_initialized(ParallelMode.TENSOR) and not is_using_isp(): setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif not is_moe_param(param) and gpc.is_initialized(ParallelMode.WEIGHT) and tp_mode == "isp": + elif not is_moe_param(param) and gpc.is_initialized(ParallelMode.WEIGHT) and is_using_isp(): setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) if not isinstance(model, nn.ModuleList): @@ -208,9 +207,7 @@ def initialize_model(pre_process_func: Optional[Callable] = None, post_process_f # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. - random_mode = ( - ParallelMode.WEIGHT_DATA if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.DATA - ) + random_mode = ParallelMode.WEIGHT_DATA if is_using_isp() else ParallelMode.DATA set_mode(random_mode) # if fsdp enabled, wrap the model @@ -262,9 +259,8 @@ def initialize_isp_communicator(model: Union[nn.Module, nn.ModuleList]): Returns: An isp communicator for managing comp/comm overlap and memory pool. 
""" - if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": - isp_communicator = None - else: + isp_communicator = None + if is_using_isp(): isp_communicator = ISPCommunicator( model, ISPCommModelConfig( diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 2f57f11a..4980255a 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -5,7 +5,7 @@ from internlm.core.context.parallel_context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.model.utils import is_moe_param -from internlm.utils.parallel import is_tensor_data_parallel_parameter +from internlm.utils.parallel import is_tensor_data_parallel_parameter, is_using_isp def split_params_into_different_groups_for_optimizer( @@ -39,7 +39,7 @@ def split_params_into_different_groups_for_optimizer( # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} - if isinstance(gpc.config.parallel["tensor"], dict) and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} # create new groups for fp32 parameter group new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 962f6415..9ace19ab 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -30,6 +30,7 @@ from internlm.utils.common import get_current_device from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer +from internlm.utils.parallel import is_using_isp from internlm.utils.storage_manager import ( get_fns, get_storage_manager, @@ -325,7 +326,7 @@ def save_model_checkpoint(folder, model): # even if pp is not considered, it will definitely not be written on the same machine. 
# for tensor parallel mode with isp - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): if wdp_rank == 0 or dp_rank == 0: fn = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" fp = os.path.join(folder, fn) @@ -564,7 +565,7 @@ def load_model_checkpoint(folder, model): for fn in fns: if fn.startswith("model_t") and not fn.endswith(".md5"): segements = os.path.splitext(fn)[0].split("_") - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): max_pp = max(max_pp, int(segements[-1][2:])) max_wp = max(max_wp, int(segements[-2][2:])) max_tp = max(max_tp, int(segements[-3][2:])) @@ -590,7 +591,7 @@ def load_model_checkpoint(folder, model): dp_size == max_zo + 1 ), f"The weights are save for {max_zo+1} FSDP shards , while current has {dp_size} FSDP shards" - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): should_load_name = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" elif gpc.config.parallel.zero1.fsdp: should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_dp{dp_rank}.pt" @@ -702,7 +703,7 @@ def save_optimizer_checkpoint(optim, state_path): states = optim.state_dict() if isinstance(optim, HybridZeroOptimizer): - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" llm_save(os.path.join(state_path, fp), states) else: @@ -752,7 +753,7 @@ def load_optimizer_checkpoint(folder, optim): max_tp, max_wp, max_pp, max_zero, max_dp = 0, 0, 0, 0, 0 for fn in fns: if fn.startswith("optimizer_") and not fn.endswith(".md5"): - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): _, tp, wp, pp, dp = os.path.splitext(fn)[0].split("_") max_dp = max(max_dp, int(dp[2:])) max_tp = max(max_tp, int(tp[2:])) @@ -770,12 +771,12 @@ def load_optimizer_checkpoint(folder, optim): pp_size = gpc.get_world_size(ParallelMode.PIPELINE) dp_size = gpc.get_world_size(ParallelMode.DATA) - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): assert dp_size == max_dp + 1, ( f"The optimizer states are save for {max_dp+1} data parallelism, " f"while current has {dp_size} data parallelism" ) - if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": + if not is_using_isp(): assert zero_size == max_zero + 1, ( f"The optimizer states are save for {max_zero+1} zero parallel, " f"while current has {zero_size} zero broadcast range." 
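The rank layout is recovered purely from the file stems, mirroring the split("_") parsing above; for example, with a made-up isp-layout filename:

    import os

    fn = "optimizer_tp0_wp1_pp2_dp3.pt"
    _, tp, wp, pp, dp = os.path.splitext(fn)[0].split("_")
    assert (int(tp[2:]), int(wp[2:]), int(pp[2:]), int(dp[2:])) == (0, 1, 2, 3)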
@@ -795,7 +796,7 @@ def load_optimizer_checkpoint(folder, optim): wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) dp_rank = gpc.get_local_rank(ParallelMode.DATA) - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" else: fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 5a491d33..76cd8d95 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -20,6 +20,10 @@ RMSNorm = try_import_RMSNorm() +def is_using_isp(): + return isinstance(gpc.config.parallel["tensor"], dict) and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" + + def is_replica_zero_parallel_parameter(p): return hasattr(p, IS_REPLICA_ZERO_PARALLEL) and getattr(p, IS_REPLICA_ZERO_PARALLEL) @@ -27,7 +31,7 @@ def is_replica_zero_parallel_parameter(p): def is_tensor_data_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.TENSOR) - and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" + and is_using_isp() and hasattr(p, IS_TENSOR_DATA_PARALLEL) and getattr(p, IS_TENSOR_DATA_PARALLEL) ) @@ -36,7 +40,7 @@ def is_tensor_data_parallel_parameter(p): def is_tensor_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.TENSOR) - and gpc.config.parallel["tensor"].get("mode", "mtp") != "isp" + and not is_using_isp() and hasattr(p, IS_TENSOR_ZERO_PARALLEL) and getattr(p, IS_TENSOR_ZERO_PARALLEL) ) @@ -45,7 +49,7 @@ def is_tensor_zero_parallel_parameter(p): def is_weight_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.WEIGHT) - and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" + and is_using_isp() and hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) ) @@ -67,9 +71,7 @@ def sync_model_param(model): """ sync_moe_param = gpc.is_using_parallel_mode(ParallelMode.EXPERT_DATA) - sync_parallel_mode = ( - ParallelMode.WEIGHT_DATA if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.DATA - ) + sync_parallel_mode = ParallelMode.WEIGHT_DATA if is_using_isp() else ParallelMode.DATA for param in model.parameters(): if sync_moe_param and getattr(param, "is_expert", False): ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) @@ -90,9 +92,7 @@ def sync_model_replica_param_group(model): model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. 
""" - parallel_mode = ( - ParallelMode.WEIGHT if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.TENSOR - ) + parallel_mode = ParallelMode.WEIGHT if is_using_isp() else ParallelMode.TENSOR if gpc.is_using_parallel_mode(parallel_mode): for param in model.parameters(): if is_replica_zero_parallel_parameter(param): From f02523edd5f510ba6916c690639c10c7683a54e9 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 17:05:44 +0800 Subject: [PATCH 146/153] fix(tests): fix ci tests error --- internlm/train/training_internlm.py | 2 +- tests/test_training/test_loss.py | 2 +- tests/test_training/test_swap_nb_loss_and_gradnorm.py | 2 +- tests/test_training/train_CI.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2fe61b7d..4bcf2e9c 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -78,11 +78,11 @@ from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( - is_using_isp, is_replica_zero_parallel_parameter, is_tensor_data_parallel_parameter, is_tensor_expert_data_parallel_parameter, is_tensor_zero_parallel_parameter, + is_using_isp, is_weight_zero_parallel_parameter, set_model_params_layer_name, sync_model_param, diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index 7e694d57..a3b3b442 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -93,7 +93,7 @@ def train( current_time = objs[0] # initialize model - model, _ = initialize_model() + model = initialize_model() # initialize loss function criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing) diff --git a/tests/test_training/test_swap_nb_loss_and_gradnorm.py b/tests/test_training/test_swap_nb_loss_and_gradnorm.py index 4d8afa28..873d2ff6 100644 --- a/tests/test_training/test_swap_nb_loss_and_gradnorm.py +++ b/tests/test_training/test_swap_nb_loss_and_gradnorm.py @@ -278,7 +278,7 @@ def exam_loss(args): seed_all(1024) # initialize model - model, _ = initialize_model() + model = initialize_model() # initialize loss function criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing) diff --git a/tests/test_training/train_CI.py b/tests/test_training/train_CI.py index a985b985..39c98781 100644 --- a/tests/test_training/train_CI.py +++ b/tests/test_training/train_CI.py @@ -124,7 +124,7 @@ def main(args): uniscale_logger = initialize_llm_logger(start_time=current_time) # initialize model - model, _ = initialize_model() + model = initialize_model() with open(args.config, "r") as f: config_lines = f.readlines() From 23ab67f0860d4567b469698902864226066f8f12 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 11:07:02 +0800 Subject: [PATCH 147/153] feat(model/modeling_llama.py): update model llama --- internlm/model/modeling_llama.py | 143 +++++++++++++++++++++---------- 1 file changed, 97 insertions(+), 46 deletions(-) diff --git a/internlm/model/modeling_llama.py b/internlm/model/modeling_llama.py index 5f999c8f..00529796 100644 --- a/internlm/model/modeling_llama.py +++ b/internlm/model/modeling_llama.py @@ -9,6 +9,7 @@ from internlm.core.context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc +from internlm.core.naive_amp import 
set_output_attr_to_module from internlm.initialize.initialize_tensor import ( normal_, scaled_init_method_normal, @@ -17,13 +18,18 @@ ) from internlm.model.embedding import Embedding1D, RotaryEmbedding from internlm.model.linear import ( - ColumnParallelLinearTorch, - FeedForward, + MegatronScaleColumnParallelLinear, RewardModelLinear, - RowParallelLinearTorch, ScaleColumnParallelLinear, + get_linear_cls, + get_mlp_cls, +) +from internlm.model.multi_head_attention import DistributedAttention +from internlm.model.utils import ( + gather_forward_split_backward, + split_forward_gather_backward, + try_import_RMSNorm, ) -from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm from internlm.solver.pipeline_utils import partition_uniform from internlm.utils.checkpoint import activation_checkpoint from internlm.utils.common import filter_kwargs @@ -59,24 +65,25 @@ class MHA(nn.Module): Args: embed_dim (int): The dimension of hidden state. num_heads (int): The number of attention heads. - num_kv_heads (int): The number of kv attention heads. process_group (torch.distributed.ProcessGroup): The group of the current device for `parallel_mode`. + sequence_process_group (torch.distributed.ProcessGroup): The process group for attention calculation. bias (boolean): Whether the bias is needed for linears. Will be used when initializing QKV matrix and output projection. True by default. dropout (float): The dropout rate for cross attention and self attention. 0.0 by default. softmax_scale (float): The temperature to use for the softmax attention. causal (boolean): Whether to apply causal attention mask. False by default. layer_idx (int): The index of current layer. None by default. - rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. rotary_emb_dim (int): The dimension of Rotary Embedding. 0 by default. rotary_emb_scale_base (int): The scaling factor of Rotary Embedding. If scale_base > 0, this implements XPos(Sun et al., https://arxiv.org/abs/2212.10554). 0 by default. use_flash_attn (boolean): Whether to use flash attention or not. If False, vanilla attention module will be used. - True by default. + False by default. device (Optional[Union[str, torch.device]]): The device will be used. dtype (Optional[torch.dtype]): The type of data. - rot_embed_HF_impl: rotary embedding hf implementation. False by default. - + use_flash_attn (bool): Whether to use flash-attn. True by default. + rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + tp_mode (str): The string value of tensor parallel mode, should be in ["mtp", "msp", "fsp", "isp"], + "mtp" by default.
""" @@ -86,6 +93,7 @@ def __init__( num_heads: int, num_kv_heads: int, process_group: Optional[torch.distributed.ProcessGroup], + sequence_process_group: Optional[torch.distributed.ProcessGroup], bias: bool = True, dropout: float = 0.0, softmax_scale: float = None, @@ -98,6 +106,7 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, rot_embed_HF_impl: Optional[bool] = False, + tp_mode: str = "mtp", ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -113,6 +122,7 @@ def __init__( self.rotary_emb_dim = rotary_emb_dim self.use_flash_attn = use_flash_attn self.dtype = dtype + self.tp_mode = tp_mode self.rot_embed_HF_impl = rot_embed_HF_impl sequence_parallel = gpc.config.parallel.get("sequence_parallel", False) @@ -122,8 +132,9 @@ def __init__( self.rotary_emb_dim, base=rope_base, scale_base=rotary_emb_scale_base, device=device ) + Wqkv_cls = get_linear_cls(self.tp_mode, "column") # notice here should change bias=True - self.wq = ColumnParallelLinearTorch( + self.wq = Wqkv_cls( embed_dim, embed_dim, process_group, @@ -131,7 +142,7 @@ def __init__( sequence_parallel=sequence_parallel, **factory_kwargs, ) - self.wk = ColumnParallelLinearTorch( + self.wk = Wqkv_cls( embed_dim, self.kv_dim, process_group, @@ -139,7 +150,7 @@ def __init__( sequence_parallel=sequence_parallel, **factory_kwargs, ) - self.wv = ColumnParallelLinearTorch( + self.wv = Wqkv_cls( embed_dim, self.kv_dim, process_group, @@ -159,8 +170,13 @@ def __init__( self.inner_cross_attn_softmax_scale = softmax_scale self.inner_cross_attn_dropout = dropout + self.attn = flash_attn_varlen_kvpacked_func + if self.tp_mode == "isp": + self.attn = DistributedAttention(self.attn, sequence_process_group=sequence_process_group) + # output projection always have the bias (for now) - self.wo = RowParallelLinearTorch( + out_proj_cls = get_linear_cls(self.tp_mode, "row") + self.wo = out_proj_cls( embed_dim, embed_dim, process_group, @@ -421,7 +437,7 @@ def _packed_forward(self, x, inference_params=None, **kwargs): if kv.dtype not in [torch.float16, torch.bfloat16]: kv = kv.to(torch.bfloat16) with torch.cuda.amp.autocast(dtype=torch.bfloat16): - context = flash_attn_varlen_kvpacked_func( + context = self.attn( q=q, kv=kv, cu_seqlens_q=kwargs["cu_seqlens"], @@ -433,7 +449,7 @@ def _packed_forward(self, x, inference_params=None, **kwargs): causal=self.inner_cross_attn_causal, ).to(self.dtype) else: - context = flash_attn_varlen_kvpacked_func( + context = self.attn( q=q, kv=kv, cu_seqlens_q=kwargs["cu_seqlens"], @@ -446,7 +462,6 @@ def _packed_forward(self, x, inference_params=None, **kwargs): ) else: raise RuntimeError("Not support this right now") - context = rearrange(context, "b h d -> b (h d)") # recover shape out = self.wo(context) return out @@ -459,7 +474,6 @@ class PackedFlashLlamaLayer1D(nn.Module): Args: hidden_size (int): The hidden size of model. 768 by default. num_attention_heads (int): The number of attention heads. 12 by default. - num_kv_attention_heads (int): The number of kv attention heads. 12 by default. mlp_ratio (int): The ratio of MLP layers. 4 by default. attn_drop_rate (float): The dropout rate of attention module. 0 by default. drop_rate (float): The dropout rate of the input hidden state. 0.0 by default. @@ -469,14 +483,7 @@ class PackedFlashLlamaLayer1D(nn.Module): layer_idx (int): The index of current layer. 0 by default. residual_in_fp32 (bool): Whether to use residual in fp32. False by default. 
device (Optional[Union[str, torch.device]]): The device will be used. - apply_post_layer_norm (bool): Whether use post layer norm. False by default. - fused_dropout_add_ln (bool): Whether use fused dropout add ln. True by default. - no_bias (bool): Whether remove bias. False by default. norm_type (str): Use RMS norm or layernorm."rmsnorm" by default. - adapt_hf (bool): Whether adapt hf. False by default. - dropout_selective_checkpoint (bool): Whether use dropout selective checkpoint. True by default. - use_scaled_init (bool): Whether use scaled init. True by default. - use_swiglu (bool): Whether use swiglu. True by default. use_flash_attn (bool): Whether use flash-attn. True by default. attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.02 by default, attn_other_init_std (float): std used to init attn_other weight. 0.02 by default, @@ -485,6 +492,8 @@ class PackedFlashLlamaLayer1D(nn.Module): ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, init_type (str): Initialization type. Use uniform or normal. "normal" by default, rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + tp_mode (str): The string value of tensor parallel mode, should be in ["mtp", "msp", "fsp", "isp"], + "mtp" by default. """ def __init__( @@ -516,6 +525,7 @@ def __init__( ffn_other_init_std: float = 0.02, init_type: str = "normal", rope_base: int = 10000, + tp_mode: str = "mtp", ): super().__init__() self.checkpoint = checkpoint @@ -532,11 +542,15 @@ def __init__( self.ffn_other_init_std = ffn_other_init_std head_dim = hidden_size // num_attention_heads + self.tp_mode = tp_mode + parallel_mode = ParallelMode.WEIGHT if self.tp_mode == "isp" else ParallelMode.TENSOR + self.attention = MHA( embed_dim=hidden_size, num_heads=num_attention_heads, num_kv_heads=num_kv_attention_heads, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), + sequence_process_group=gpc.get_group(ParallelMode.TENSOR), dropout=attn_drop_rate, softmax_scale=1 / math.sqrt(head_dim), causal=True, @@ -549,6 +563,7 @@ def __init__( rot_embed_HF_impl=adapt_hf, bias=not no_bias, rope_base=rope_base, + tp_mode=self.tp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -564,11 +579,12 @@ def __init__( sequence_parallel = gpc.config.parallel.get("sequence_parallel", False) if use_swiglu: - self.feed_forward = FeedForward( + mlp_cls = get_mlp_cls(self.tp_mode) + self.feed_forward = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias=False, device=device, dtype=dtype, @@ -579,7 +595,7 @@ def __init__( int(hidden_size * mlp_ratio), out_features=hidden_size, activation="gelu_approx", - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias1=False, bias2=False, sequence_parallel=sequence_parallel, @@ -731,7 +747,6 @@ class PackedFlashLlama1D(nn.Module): num_layers (int): The number of layer. 12 by default. hidden_size (int): The size of hidden state. 768 by default. num_attention_heads (int): The number of attention head. 12 by default. - num_kv_attention_heads (int): The number of kv attention head. 12 by default. vocab_size (int): The size of vocabulary. 50304 by default. mlp_ratio (int): The ratio of MLP layers. 4 by default. attn_drop_rate (float): The dropout rate of attention module. 0.0 by default. 
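The DistributedAttention wrapper that now guards flash_attn_varlen_kvpacked_func in isp mode is defined in internlm/model/multi_head_attention.py, which this series does not show. The sketch below is a minimal illustration of the underlying idea, assuming an Ulysses-style all-to-all: each rank starts with a sequence shard and all heads, swaps to the full sequence with a head subset for the local attention call, then swaps back. DistributedAttentionSketch, its shape handling, and the divisibility assumptions are illustrative (cu_seqlens bookkeeping and uneven splits are omitted); it is not the actual implementation.

import torch
import torch.distributed as dist

class DistributedAttentionSketch(torch.nn.Module):
    """All-to-all wrapper around a local attention function (illustrative)."""

    def __init__(self, local_attn, sequence_process_group):
        super().__init__()
        self.local_attn = local_attn
        self.spg = sequence_process_group

    def _all_to_all(self, x, scatter_dim, gather_dim):
        # assumes both dims are divisible by the sequence parallel world size
        world_size = dist.get_world_size(self.spg)
        inputs = [t.contiguous() for t in x.chunk(world_size, dim=scatter_dim)]
        outputs = [torch.empty_like(t) for t in inputs]
        dist.all_to_all(outputs, inputs, group=self.spg)
        return torch.cat(outputs, dim=gather_dim)

    def forward(self, q, kv, **kwargs):
        # q: (tokens/sp, heads, d) -> (tokens, heads/sp, d); kv scatters at
        # dim 2 because of the packed (tokens, 2, heads_k, d) layout
        q = self._all_to_all(q, scatter_dim=1, gather_dim=0)
        kv = self._all_to_all(kv, scatter_dim=2, gather_dim=0)
        context = self.local_attn(q=q, kv=kv, **kwargs)
        # bring the output back to the sequence-sharded layout
        return self._all_to_all(context, scatter_dim=0, gather_dim=1)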
@@ -749,15 +764,8 @@ class PackedFlashLlama1D(nn.Module): parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default. start_layer_idx (int): The index of start layer in the pipeline. 0 by default. device (Optional[Union[str, torch.device]]): The device will be used. None by default. - apply_post_layer_norm (bool): Whether use post layer norm. False by default. - no_bias (bool): Whether remove bias. False by default. residual_in_fp32 (bool): Whether to use residual in fp32. False by default. norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default. - adapt_hf (bool): Whether adapt hf. False by default. - is_reward (bool): Whether use is_reward. False by default. - dropout_selective_checkpoint (bool): Whether dropout selective checkpoint. True by default. - use_scaled_init (bool): Whether use scaled init. True by default. - use_swiglu (bool): Whether use swiglu. True by default. use_flash_attn (bool): Whether to use flash-attn. True by default. embedding_init_std (float): std used to init embedding weight. 0.02 by default, attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.02 by default, @@ -767,6 +775,7 @@ class PackedFlashLlama1D(nn.Module): ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, out_head_init_std (float): std used to init output lmhead weight. 0.02 by default, init_type (str): Initialization type. Use uniform or normal. "normal" by default, + extra_pred_tokens (int): The number of extra output head for multi-token-prediction. 0 by default. rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. """ @@ -808,6 +817,7 @@ def __init__( ffn_other_init_std: float = 0.02, out_head_init_std: float = 0.02, init_type: str = "normal", + extra_pred_tokens: int = 0, rope_base: int = 10000, ): super().__init__() @@ -819,10 +829,18 @@ def __init__( checkpoint_fraction = 0 checkpoint_layer_num = num_layers * checkpoint_fraction sequence_parallel = gpc.config.parallel.get("sequence_parallel", False) + self.tp_mode = "mtp" + if isinstance(gpc.config.parallel["tensor"], dict): + self.tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") + if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear + head_cls = ( + ScaleColumnParallelLinear + if self.tp_mode in ["mtp", "fsp", "isp"] + else MegatronScaleColumnParallelLinear + ) if first: if embed_split_hidden: self.tok_embeddings = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -874,6 +892,7 @@ def __init__( ffn_other_init_std=ffn_other_init_std, init_type=init_type, rope_base=rope_base, + tp_mode=self.tp_mode, ) for lid in range(num_layers) ] @@ -895,13 +914,36 @@ def __init__( dtype=dtype, weight_scale=embed_grad_scale, ) - + set_output_attr_to_module(self.output) for _, param in self.output.named_parameters(): if init_type == "normal": normal_(std=out_head_init_std)(param) else: uniform_(std=out_head_init_std)(param) + if extra_pred_tokens > 0: + self.extra_pred_tokens = extra_pred_tokens + assert not is_reward, "extra_pred_tokens > 0 means using multi token prediction, not implement for RLHF" + self.extra_outputs = nn.ModuleList( + [ + head_cls( + in_features=hidden_size, + out_features=vocab_size, + process_group=gpc.get_group(ParallelMode.TENSOR), + bias=False, + device=device, + dtype=dtype, + weight_scale=embed_grad_scale, + ) + for _ in range(self.extra_pred_tokens) + ] + ) + for _, param in self.extra_outputs.named_parameters(): + if 
init_type == "normal": + normal_(std=out_head_init_std)(param) + else: + uniform_(std=out_head_init_std)(param) + self.parallel_output = parallel_output def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None): @@ -925,6 +967,10 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] + # if the sequence parallel mode is 'isp', the indexes should also be split in sequence dimension. + if gpc.config.parallel.sequence_parallel and self.tp_mode == "isp": + indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None for _, block in enumerate(self.layers): @@ -939,10 +985,16 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) - - extra_hidden_states_list = None + if hasattr(self, "extra_pred_tokens") and self.extra_pred_tokens > 0: + extra_hidden_states_list = [self.extra_outputs[i](hidden_states) for i in range(self.extra_pred_tokens)] + else: + extra_hidden_states_list = None if hasattr(self, "output"): - hidden_states = self.output(hidden_states) + # Evaluation + if gpc.is_evaluating is True: + hidden_states = self.output(hidden_states, gather_dim=1) + else: # Training + hidden_states = self.output(hidden_states, gather_dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) @@ -977,7 +1029,7 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"), logger.info(f"The layer sharding is {all_parts}.") models = [] - kwargs["checkpoint_fraction"] = 1.0 + kwargs["checkpoint_fraction"] = float(kwargs.get("checkpoint", False)) start_idx, end_idx = 0, 0 for start, end in parts: start_idx, end_idx = start, end @@ -1035,6 +1087,7 @@ def build_model_with_cfg( ffn_other_init_std: float = 0.02, out_head_init_std: float = 0.02, init_type: str = "normal", + extra_pred_tokens: int = 0, rope_base: int = 10000, ): """ @@ -1052,18 +1105,14 @@ def build_model_with_cfg( embed_grad_scale (float): Refer to GLM-130B, for training stability. 0.1 by default. parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default. num_attention_heads (int): The number of attention head. 32 by default. - num_kv_attention_heads (int): The number of kv attention head. None by default. mlp_ratio (int): The ratio of MLP layers. 4.0 by default. residual_in_fp32 (bool): Whether to use residual in fp32. False by default. It cannot be used temporarily because this parameter requires inconsistent data types to be passed between pipelines, which requires significant modifications to internlm. norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default. - adapt_hf (bool): Whether adapt hf. False by default. drop_rate (float): The dropout rate of input hidden state. 0 by default. attn_drop_rate (float): The dropout rate of attention module. 0 by default. apply_post_layer_norm (bool): Whether to apply post layer norm. False by default. - no_bias (bool): Whether remove bias. False by default. - deepnorm (bool): Whether us deepnorm. False by default. layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-5 by default. 
is_reward (bool): Whether to use reward model. False by default. dropout_selective_checkpoint (bool): It can only be enabled when checkpoint is disabled. True by default. @@ -1078,6 +1127,7 @@ def build_model_with_cfg( ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, out_head_init_std (float): std used to init output lmhead weight. 0.02 by default, init_type (str): Initialization type. Use uniform or normal. "normal" by default, + extra_pred_tokens (int): The number of extra output head for multi-token-prediction. 0 by default. rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. """ if deepnorm: @@ -1114,6 +1164,7 @@ def build_model_with_cfg( ffn_other_init_std=ffn_other_init_std, out_head_init_std=out_head_init_std, init_type=init_type, + extra_pred_tokens=extra_pred_tokens, rope_base=rope_base, ) From f11422e2d07c463f690212a85370914c7f72e436 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 14:37:52 +0800 Subject: [PATCH 148/153] feat(model/utils.py): simplify code --- internlm/model/linear.py | 18 ++-- internlm/model/utils.py | 224 +++++++++------------------------------ 2 files changed, 60 insertions(+), 182 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 9506f608..9d77bb34 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -12,9 +12,9 @@ from internlm.core.context import global_context as gpc from internlm.model.utils import ( Silu, - fused_dense_func_torch, + fused_dense_func, isp_fused_dense_func, - megatron_fused_dense_func_torch, + megatron_fused_dense_func, ) @@ -67,7 +67,7 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() else: weight = self.weight - return fused_dense_func_torch( + return fused_dense_func( input, weight, self.bias, @@ -90,7 +90,7 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() else: weight = self.weight - return megatron_fused_dense_func_torch( + return megatron_fused_dense_func( input, weight, self.bias, @@ -140,7 +140,7 @@ def forward(self, input): # pylint: disable=W0622 weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() else: weight = self.weight - return fused_dense_func_torch( + return fused_dense_func( input, weight, self.bias, @@ -154,7 +154,7 @@ def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. - return fused_dense_func_torch( + return fused_dense_func( x, self.weight, self.bias, @@ -169,7 +169,7 @@ def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. - return megatron_fused_dense_func_torch( + return megatron_fused_dense_func( x, self.weight, self.bias, @@ -185,7 +185,7 @@ def forward(self, x): We're doing Tensor Parallel with sequence parallelism: we do the matmul and then a reduce_scatter of the result. 
""" - out = fused_dense_func_torch(x, self.weight, self.bias) + out = fused_dense_func(x, self.weight, self.bias) reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce return reduce_fn(out, self.process_group) @@ -196,7 +196,7 @@ def forward(self, x): We're doing Tensor Parallel with sequence parallelism: we do the matmul and then a reduce_scatter of the result. """ - out = megatron_fused_dense_func_torch(x, self.weight, self.bias) + out = megatron_fused_dense_func(x, self.weight, self.bias) reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce return reduce_fn(out, self.process_group) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index c6ae7002..c79a04fc 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -184,6 +184,7 @@ def forward( process_group=None, sequence_parallel=True, gather_dim=0, + is_using_cuda: bool = True, ): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel @@ -194,6 +195,7 @@ def forward( ctx.process_group = process_group ctx.sequence_parallel = sequence_parallel ctx.gather_dim = gather_dim + ctx.is_using_cuda = is_using_cuda if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -233,6 +235,8 @@ def backward(ctx, grad_output, *args): sequence_parallel = ctx.sequence_parallel gather_dim = ctx.gather_dim + backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.is_using_cuda else linear_bias_wgrad_torch + if ctx.compute_weight_gradient: x, weight = ctx.saved_tensors if process_group is not None and sequence_parallel: @@ -264,7 +268,7 @@ def backward(ctx, grad_output, *args): assert ctx.compute_weight_gradient if process_group is not None and sequence_parallel: handle_x.wait() - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + grad_weight, grad_bias = backward_func( total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2], @@ -274,7 +278,7 @@ def backward(ctx, grad_output, *args): grad_bias = grad_output if ctx.needs_input_grad[2] else None if process_group is not None and ctx.needs_input_grad[0]: handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None, None class MegatronFusedDenseFunc(torch.autograd.Function): @@ -295,6 +299,7 @@ def forward( process_group=None, sequence_parallel=True, gather_dim=0, + is_using_cuda: bool = True, ): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel @@ -304,6 +309,7 @@ def forward( ctx.return_residual = return_residual ctx.process_group = process_group ctx.sequence_parallel = sequence_parallel + ctx.is_using_cuda = is_using_cuda if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -342,115 +348,8 @@ def backward(ctx, grad_output, *args): process_group = ctx.process_group sequence_parallel = ctx.sequence_parallel - if ctx.compute_weight_gradient: - total_x, weight = ctx.saved_tensors - else: - (weight,) = ctx.saved_tensors - total_x = None - batch_shape = grad_output.shape[:-1] - batch_dim = batch_shape.numel() - grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, weight.t()) - else: - grad_input = torch.addmm( - grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, - weight, - ) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - if process_group is not 
None: - reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw - grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) - else: - grad_input = None - if ctx.needs_input_grad[1]: - assert ctx.compute_weight_gradient - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), - grad_output, - ctx.needs_input_grad[2], - ) - else: - grad_weight = None - grad_bias = grad_output if ctx.needs_input_grad[2] else None - if process_group is not None and ctx.needs_input_grad[0]: - handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None - - -# adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py -class FusedDenseFuncTorch(FusedDenseFunc): - """FusedDenseFunc in flash implementation for supporting torch.float32""" - - @staticmethod - @custom_bwd - def backward(ctx, grad_output, *args): - grad_output = grad_output.contiguous() - if ctx.return_residual: - (grad_input,) = args - grad_input = grad_input.contiguous() - process_group = ctx.process_group - sequence_parallel = ctx.sequence_parallel - gather_dim = ctx.gather_dim - if ctx.compute_weight_gradient: - x, weight = ctx.saved_tensors - if process_group is not None and sequence_parallel: - total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) - else: - total_x = x - else: - (weight,) = ctx.saved_tensors - total_x = None - batch_shape = grad_output.shape[:-1] - batch_dim = batch_shape.numel() - grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, weight.t()) - else: - grad_input = torch.addmm( - grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, - weight, - ) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - if process_group is not None: - reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw - grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) - else: - grad_input = None - if ctx.needs_input_grad[1]: - assert ctx.compute_weight_gradient - if process_group is not None and sequence_parallel: - handle_x.wait() - # we remove the cuda independence, which is different from flash_attn. 
- grad_weight, grad_bias = linear_bias_wgrad_torch( - total_x.reshape(batch_dim, total_x.shape[-1]), - grad_output, - ctx.needs_input_grad[2], - ) - else: - grad_weight = None - grad_bias = grad_output if ctx.needs_input_grad[2] else None - if process_group is not None and ctx.needs_input_grad[0]: - handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None - - -class MegatronFusedDenseFuncTorch(FusedDenseFunc): - """FusedDenseFunc in megatron implementation for supporting torch.float32""" + backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.is_using_cuda else linear_bias_wgrad_torch - @staticmethod - @custom_bwd - def backward(ctx, grad_output, *args): - grad_output = grad_output.contiguous() - if ctx.return_residual: - (grad_input,) = args - grad_input = grad_input.contiguous() - process_group = ctx.process_group - sequence_parallel = ctx.sequence_parallel if ctx.compute_weight_gradient: total_x, weight = ctx.saved_tensors else: @@ -476,8 +375,7 @@ def backward(ctx, grad_output, *args): grad_input = None if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient - # we remove the cuda independence, which is different from flash_attn. - grad_weight, grad_bias = linear_bias_wgrad_torch( + grad_weight, grad_bias = backward_func( total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2], @@ -487,7 +385,7 @@ def backward(ctx, grad_output, *args): grad_bias = grad_output if ctx.needs_input_grad[2] else None if process_group is not None and ctx.needs_input_grad[0]: handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None, None class ISPFusedDenseFunc(torch.autograd.Function): @@ -503,13 +401,13 @@ def forward( module, communicator, return_residual=False, - use_flash_attn: bool = True, + is_using_cuda: bool = True, ): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.module = module ctx.communicator = communicator - ctx.use_flash_attn = use_flash_attn + ctx.is_using_cuda = is_using_cuda if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -547,7 +445,7 @@ def backward(ctx, grad_output, *args): module = ctx.module communicator = ctx.communicator - backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.use_flash_attn else linear_bias_wgrad_torch + backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.is_using_cuda else linear_bias_wgrad_torch grad_output = grad_output.contiguous() if ctx.return_residual: @@ -606,10 +504,10 @@ def backward(ctx, grad_output, *args): if grad_bias is not None and grad_bias_sync is not None: grad_bias_sync.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None -def fused_dense_func_torch( +def fused_dense_func( x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, @@ -621,29 +519,20 @@ def fused_dense_func_torch( dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) - if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FusedDenseFunc.apply( - x, - weight, - bias, - return_residual, - process_group, - sequence_parallel, - gather_dim, - ) - else: - return FusedDenseFuncTorch.apply( - x, - weight, - bias, - return_residual, - process_group, - sequence_parallel, - gather_dim, - ) - - -def 
megatron_fused_dense_func_torch( + is_using_cuda = x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible + return FusedDenseFunc.apply( + x, + weight, + bias, + return_residual, + process_group, + sequence_parallel, + gather_dim, + is_using_cuda, + ) + + +def megatron_fused_dense_func( x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, @@ -655,26 +544,17 @@ def megatron_fused_dense_func_torch( dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) - if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return MegatronFusedDenseFunc.apply( - x, - weight, - bias, - return_residual, - process_group, - sequence_parallel, - gather_dim, - ) - else: - return MegatronFusedDenseFuncTorch.apply( - x, - weight, - bias, - return_residual, - process_group, - sequence_parallel, - gather_dim, - ) + is_using_cuda = x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible + return MegatronFusedDenseFunc.apply( + x, + weight, + bias, + return_residual, + process_group, + sequence_parallel, + gather_dim, + is_using_cuda, + ) def isp_fused_dense_func( @@ -688,18 +568,16 @@ def isp_fused_dense_func( dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) - if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return ISPFusedDenseFunc.apply(x, weight, bias, module, communicator, return_residual) - else: - return ISPFusedDenseFunc.apply( - x, - weight, - bias, - module, - communicator, - return_residual, - use_flash_attn=False, - ) + is_using_cuda = x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible + return ISPFusedDenseFunc.apply( + x, + weight, + bias, + module, + communicator, + return_residual, + is_using_cuda, + ) def try_import_RMSNorm(): From 8e1ee6fc28d74684dd93eb3e3302b9c5ea7cff2d Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 16:45:07 +0800 Subject: [PATCH 149/153] feat(model/linear.py): update FeedForward class to internlm2 --- internlm/model/linear.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 9d77bb34..9ce91632 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional +from typing import Callable, Optional import torch from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear @@ -215,6 +215,8 @@ class BaseFeedForward(nn.Module): device (Optional[Union[str, torch.device]]): The device will be used. dtype (Optional[torch.dtype]): The type of data. multiple_of (int): For efficient training. Reset the size of hidden feature. 256 by default. + column_cls (Optional[Callable]): The column parallel class for w1 and w3. None by default. + row_cls (Optional[Callable]): The row parallel class for w2. None by default. 
""" def __init__( @@ -227,13 +229,13 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - colum_cls=None, - row_cls=None, + column_cls: Optional[Callable] = None, + row_cls: Optional[Callable] = None, ): super().__init__() hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - self.w1 = colum_cls( + self.w1 = column_cls( in_features, hidden_features, process_group, @@ -242,20 +244,20 @@ def __init__( device=device, dtype=dtype, ) - self.w2 = colum_cls( - in_features, + self.w2 = row_cls( hidden_features, + out_features, process_group, - bias, + bias=bias, sequence_parallel=gpc.config.parallel.sequence_parallel, device=device, dtype=dtype, ) - self.w3 = row_cls( + self.w3 = column_cls( + in_features, hidden_features, - out_features, process_group, - bias=bias, + bias, sequence_parallel=gpc.config.parallel.sequence_parallel, device=device, dtype=dtype, @@ -263,8 +265,8 @@ def __init__( def forward(self, x): w1_o = self.w1(x) - w2_o = self.w2(x) - out = self.w3(Silu(w1_o, w2_o)) + w3_o = self.w3(x) + out = self.w2(Silu(w1_o, w3_o)) return out From d7928a690a47e2b27c05a0a33fc83adbd91970f5 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 17:24:13 +0800 Subject: [PATCH 150/153] fix(parallel_context.py): fix private repo ci tests error --- internlm/core/context/parallel_context.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index d597575c..4141b011 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -498,8 +498,10 @@ def init_parallel_groups(self): # the user should not set the data parallel size manually # instead, it should be calculated based on other parallel config self.sequence_parallel_size = self.tensor_parallel_size - self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size - self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size + self.data_parallel_size = max(1, self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size) + self.weight_data_parallel_size = max( + 1, self.world_size // self.pipeline_parallel_size // self.weight_parallel_size + ) if isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "isp": if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.weight_data_parallel_size From 1960dc0d151127abebcdb29e197b108f0d6bc500 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 18:19:28 +0800 Subject: [PATCH 151/153] feat(parallel_context.py): set zero1 parallel size >= 1 --- internlm/core/context/parallel_context.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 4141b011..b1c7034d 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -505,6 +505,7 @@ def init_parallel_groups(self): if isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "isp": if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.weight_data_parallel_size + self.zero1_parallel_size = max(1, self.zero1_parallel_size) assert ( self.zero1_parallel_size <= self.weight_data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} 
should be less than wdp_size:{self.weight_data_parallel_size}" @@ -515,6 +516,7 @@ def init_parallel_groups(self): else: if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.data_parallel_size + self.zero1_parallel_size = max(1, self.zero1_parallel_size) assert ( self.zero1_parallel_size <= self.data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" From 62a665d6797cf026faf80383b454a14b62508d3f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 31 Jan 2024 15:39:10 +0800 Subject: [PATCH 152/153] feat(tests): add e2e test case for isp and enable pytorch expandable_segments --- .github/workflows/e2e_test.yaml | 17 ++- configs/7B_isp_sft.py | 200 +++++++++++++++++++++++++++++++ configs/7B_sft.py | 2 +- internlm/initialize/launch.py | 3 + internlm/utils/common.py | 12 ++ tests/test_training/test_loss.py | 24 +++- train.py | 4 +- 7 files changed, 257 insertions(+), 5 deletions(-) create mode 100644 configs/7B_isp_sft.py diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 5c7b4430..965905c7 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -23,4 +23,19 @@ jobs: - name: training_8GPU run: | source $evo_env - srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py + + training_8GPU_ISP: + runs-on: [t_cluster] + timeout-minutes: 10 + steps: + - name: mask env + run: | + echo "::add-mask::${{env.WORKSPACE_PREFIX}}" + echo "::add-mask::$path_prefix" + - uses: actions/checkout@v3 + + - name: training_8GPU_ISP + run: | + source $evo_env + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py diff --git a/configs/7B_isp_sft.py b/configs/7B_isp_sft.py new file mode 100644 index 00000000..9aac5557 --- /dev/null +++ b/configs/7B_isp_sft.py @@ -0,0 +1,200 @@ +JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 2048 +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. 
the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internlm", "llama", "hf_llama". + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = None # "/path/to/dataset" +VALID_FOLDER = None # "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=50000, + skip_batches="", + # rampup_batch_size (str): A string with three space-separated integers representing the + # starting batch size, the increment, and the number of steps between + # each increment. For example, "192 24 8" means that the batch size (micro_num) + # starts at 192 and increases by 24 every 8 steps. Defaults to None. + # (IMPORTANT): The interval step size is 'micro_bsz'. 
+ rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + train_folder=TRAIN_FOLDER, + valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=200, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimizer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +use_fp32_norm = False +model = dict( + checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'], + defaults to 'mtp', means the pure megatron tensor parallel without sequence parallel. + msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size. + fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size. + isp: customized intern sequence parallel without tensor parallel, can be used with weight parallel. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False.
+weight parallel (dict): + 1. size: int, the size of weight parallel. + 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. memory_pool: bool, enable/disable memory pool, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1), + tensor=dict(size=2, mode="isp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=4, overlap=True, memory_pool=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, + ), +) + +# metric_dtype can be "fp32" or other string +# only when set to "fp32" will use fp32 to calc in metrics +# metric_dtype = "fp32" diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 615cd6c3..577fc93c 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -173,7 +173,7 @@ 3. memory_pool: bool, enable/disable memory pool, defaults to False. """ parallel = dict( - zero1=dict(size=8, fsdp=False), + zero1=dict(size=8), tensor=dict(size=1, mode="mtp"), pipeline=dict(size=1, interleaved_overlap=True), weight=dict(size=1, overlap=True, memory_pool=True), diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 61e2eeb4..443a53d6 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -325,6 +325,9 @@ def args_sanity_check(): gpc.config.parallel["tensor"]["mode"] = "mtp" if gpc.config.parallel["tensor"]["mode"] == "isp": assert not gpc.config.parallel.zero1.fsdp, "FSDP does not support isp" + assert ( + torch.__version__ >= "2.1.0" + ), f"requires torch>=2.1.0 when using isp but current version is {torch.__version__}" assert gpc.config.parallel["tensor"].get("mode", None) in [ "mtp", "msp", diff --git a/internlm/utils/common.py b/internlm/utils/common.py index 39e2d902..e759e013 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -14,8 +14,10 @@ import torch import internlm +from internlm.utils.logger import get_logger CURRENT_TIME = None +logger = get_logger(__file__) def parse_args(): @@ -232,6 +234,16 @@ def get_megatron_flops( return tflops +def enable_pytorch_expandable_segments(): + if torch.__version__ >= "2.1.0": + _alloc_setting = "expandable_segments:True" + if os.getenv("PYTORCH_CUDA_ALLOC_CONF", None) is not None: + _alloc_setting = os.getenv("PYTORCH_CUDA_ALLOC_CONF") + "," + _alloc_setting + torch.cuda.memory._set_allocator_settings(_alloc_setting) + else: + logger.warning("To support the 'expandable_segments' configuration, please upgrade torch to version 2.1.0.") + + class DummyProfile: """ Dummy Profile. 
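A usage note on the enable_pytorch_expandable_segments helper added above: on torch >= 2.1.0 it appends to any allocator options already present rather than overwriting them. A small illustration, with an assumed pre-existing setting:

import os
from internlm.utils.common import enable_pytorch_expandable_segments

# Suppose the job script already tuned the allocator (assumed value):
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
enable_pytorch_expandable_segments()
# torch.cuda.memory._set_allocator_settings() is now called with
# "max_split_size_mb:128,expandable_segments:True"; with no pre-existing
# setting it would receive just "expandable_segments:True".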
diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index a3b3b442..95079812 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -17,7 +17,9 @@ get_train_data_loader, initialize_model, initialize_optimizer, + initialize_isp_communicator, load_new_batch, + get_scheduler_hooks, ) from internlm.utils.common import BatchSkipper, launch_time from internlm.utils.gputest import empty_cache_and_diag @@ -46,6 +48,7 @@ def train( dp_size: int = 1, tp_size: int = 1, + wp_size: int = 1, pp_size: int = 1, num_chunks: int = 2, interleaved: bool = False, @@ -62,6 +65,9 @@ def train( assert ( gpc.get_world_size(ParallelMode.TENSOR) == tp_size ), f"tensor parallel size: {gpc.get_world_size(ParallelMode.TENSOR)} is not as expected {tp_size}" + assert ( + gpc.get_world_size(ParallelMode.WEIGHT) == wp_size + ), f"weight parallel size: {gpc.get_world_size(ParallelMode.WEIGHT)} is not as expected {wp_size}" assert ( gpc.get_world_size(ParallelMode.PIPELINE) == pp_size ), f"pipeline parallel size: {gpc.get_world_size(ParallelMode.PIPELINE)} is not as expected {pp_size}" @@ -95,6 +101,9 @@ def train( # initialize model model = initialize_model() + # initialize isp communicator + isp_communicator = initialize_isp_communicator(model) + # initialize loss function criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing) @@ -104,7 +113,7 @@ def train( # initialize and resume train state train_state = TrainState(gpc.config, train_dl.batch_sampler) - optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) + optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model, isp_communicator) with open(CONFIG_FILE_PATH, "r") as f: config_lines = f.readlines() @@ -143,6 +152,7 @@ def train( ), ] + # initialize trainer trainer, train_dl, _, _ = internlm.initialize_trainer( model=model, optimizer=optimizer, @@ -150,7 +160,7 @@ def train( train_dataloader=train_dl, lr_scheduler=lr_scheduler, beta2_scheduler=beta2_scheduler, - scheduler_hooks=scheduler_hooks, + scheduler_hooks=get_scheduler_hooks(metric, optimizer, isp_communicator), ) # initialize the batch skipper @@ -291,3 +301,13 @@ def test_training_loss_with_dp8_pp2_interleaved_overlap(): check_loss_spike() check_loss_accuracy() + + +@pytest.mark.training_8GPU_ISP +def test_training_with_isp(): + # update config file + global CONFIG_FILE_PATH + CONFIG_FILE_PATH = "./configs/7B_isp_sft.py" + + # model training + train(dp_size=4, tp_size=2, wp_size=4, enable_sp=True) diff --git a/train.py b/train.py index 150f5463..490894a9 100644 --- a/train.py +++ b/train.py @@ -31,6 +31,7 @@ ) from internlm.utils.common import ( BatchSkipper, + enable_pytorch_expandable_segments, get_megatron_flops, launch_time, parse_args, @@ -69,6 +70,8 @@ def initialize_llm_logger(start_time: str): def main(args): + enable_pytorch_expandable_segments() + # init setting skip_batches = gpc.config.data.skip_batches total_steps = gpc.config.data.total_steps @@ -155,7 +158,6 @@ def main(args): ) # initialize trainer - trainer, train_dl, _, _ = internlm.initialize_trainer( model=model, optimizer=optimizer, From e91acb47a4c5456092d20f4ff6d62ea9b22ab586 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 31 Jan 2024 16:59:43 +0800 Subject: [PATCH 153/153] feat(doc): update doc torch and flashattn version --- .github/workflows/e2e_test.yaml | 2 ++ doc/en/install.md | 10 +++++----- doc/install.md | 10 +++++----- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git 
a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 965905c7..a9f6cdc1 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -38,4 +38,6 @@ jobs: - name: training_8GPU_ISP run: | source $evo_env + conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2 + conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2 srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py diff --git a/doc/en/install.md b/doc/en/install.md index f1f3abd7..57df69cf 100644 --- a/doc/en/install.md +++ b/doc/en/install.md @@ -5,17 +5,17 @@ The required packages and corresponding version are shown as follows: - Python == 3.10 - GCC == 10.2.0 - MPFR == 4.1.0 -- CUDA >= 11.7 -- Pytorch >= 1.13.1 +- CUDA >= 11.8 +- Pytorch >= 2.1.0 - Transformers >= 4.28.0 -- Flash-Attention >= v1.0.5 +- Flash-Attention >= v2.2.1 - Apex == 23.05 - GPU with Ampere or Hopper architecture (such as H100, A100) - Linux OS After installing the above dependencies, some system environment variables need to be updated: ```bash -export CUDA_PATH={path_of_cuda_11.7} +export CUDA_PATH={path_of_cuda_11.8} export GCC_HOME={path_of_gcc_10.2.0} export MPFR_HOME={path_of_mpfr_4.1.0} export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH @@ -47,7 +47,7 @@ pip install -r requirements/torch.txt pip install -r requirements/runtime.txt ``` -Install flash-attention (version v1.0.5): +Install flash-attention (version v2.2.1): ```bash cd ./third_party/flash-attention python setup.py install diff --git a/doc/install.md b/doc/install.md index 3016457d..f6a8588d 100644 --- a/doc/install.md +++ b/doc/install.md @@ -5,17 +5,17 @@ - Python == 3.10 - GCC == 10.2.0 - MPFR == 4.1.0 -- CUDA >= 11.7 -- Pytorch >= 1.13.1 +- CUDA >= 11.8 +- Pytorch >= 2.1.0 - Transformers >= 4.28.0 -- Flash-Attention >= v1.0.5 +- Flash-Attention >= v2.2.1 - Apex == 23.05 - Ampere或者Hopper架构的GPU (例如H100, A100) - Linux OS 以上依赖包安装完成后,需要更新配置系统环境变量: ```bash -export CUDA_PATH={path_of_cuda_11.7} +export CUDA_PATH={path_of_cuda_11.8} export GCC_HOME={path_of_gcc_10.2.0} export MPFR_HOME={path_of_mpfr_4.1.0} export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH @@ -46,7 +46,7 @@ pip install -r requirements/torch.txt pip install -r requirements/runtime.txt ``` -安装 flash-attention (version v1.0.5): +安装 flash-attention (version v2.2.1): ```bash cd ./third_party/flash-attention python setup.py install
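To close, a quick environment sanity check matching the updated requirements, in the spirit of the version assert this series adds to internlm/initialize/launch.py. The same caveat applies: these are lexicographic string comparisons, which happens to work for the versions in question, and it assumes flash_attn exposes __version__ as recent releases do.

import torch
import flash_attn

assert torch.__version__ >= "2.1.0", f"torch {torch.__version__} < 2.1.0"
assert flash_attn.__version__ >= "2.2.1", f"flash-attn {flash_attn.__version__} < 2.2.1"
print(f"CUDA runtime: {torch.version.cuda}")  # expected to be >= 11.8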