Commit: [metax] add mixtral-8x7b flagscale continue train patch (FlagOpen#228)
Co-authored-by: sfwang <wangshunfei2010@126.com>
Showing 1 changed file with 322 additions and 0 deletions.
@@ -0,0 +1,322 @@
From 0b008a5c8581dabee7a06533c2a926199d064546 Mon Sep 17 00:00:00 2001
From: sfwang <wangshunfei2010@126.com>
Date: Sat, 5 Oct 2024 07:49:05 +0000
Subject: [PATCH] add mixtral-8x7b test case

---
 examples/mixtral/conf/config.yaml | 58 +++++--
 .../conf/train/train_mixtral_8x7b.yaml | 154 +++++++++++-------
 .../dist_checkpointing/strategies/base.py | 4 +-
 .../core/distributed/param_and_grad_buffer.py | 3 +-
 megatron/megatron/legacy/fused_kernels/__init__.py | 2 +-
 megatron/megatron/training/checkpointing.py | 4 +-
 6 files changed, 139 insertions(+), 86 deletions(-)

diff --git a/examples/mixtral/conf/config.yaml b/examples/mixtral/conf/config.yaml
index 3e8c10f7..2a9e5fc6 100644
--- a/examples/mixtral/conf/config.yaml
+++ b/examples/mixtral/conf/config.yaml
@@ -1,24 +1,48 @@
+action: run
defaults:
- - _self_
- - train: train_mixtral_8x7b
-
+- _self_
+- train: train_mixtral_8x7b
experiment:
- exp_name: mixtral-8x7b
- exp_dir: outputs
- task:
- type: train
- backend: megatron
- entrypoint: flagscale/train/train_mixtral.py
- runner:
- backend: torchrun
- hostfile: <xxxx>
+ cmds:
+ before_start: ''
envs:
- CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+ CUCC_PATH: /opt/maca/tools/cu-bridge
CUDA_DEVICE_MAX_CONNECTIONS: 1
+ CUDA_PATH: /opt/maca/tools/cu-bridge
+ CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+ DEVINFO_ROOT: /opt/maca
+ FORCE_ACTIVATE_WAIT: 1
+ LD_LIBRARY_PATH: /opt/maca/lib:/opt/maca/mxgpu_llvm/lib:/opt/mxdriver/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib
+ MACA_CLANG: /opt/maca/mxgpu_llvm
+ MACA_CLANG_PATH: /opt/maca/mxgpu_llvm/bin
+ MACA_PATH: /opt/maca
+ MACA_SMALL_PAGESIZE_ENABLE: 1
+ MAX_JOBS: 20
+ MCCL_IB_GID_INDEX: 1
+ MCCL_LIMIT_RING_LL_THREADTHRESHOLDS: 1
+ MCCL_MAX_NCHANNELS: 16
+ MCCL_NET_GDR_LEVEL: 7
+ MCCL_P2P_LEVEL: SYS
+ MCPYTORCH_DISABLE_PRINT: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-
-action: run
-
+ NVTE_APPLY_QK_LAYER_SCALING: 0
+ NVTE_FLASH_ATTN: 1
+ NVTE_FUSED_ATTN: 0
+ PATH: /opt/conda/bin:/opt/conda/condabin:/opt/maca/tools/cu-bridge:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/conda/bin:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ PYTORCH_ENABLE_SAME_RAND_A100: 1
+ SET_DEVICE_NUMA_PREFERRED: 1
+ exp_dir: /share/project/lyt/pr_tmp/FlagPerf/training/result/run20241005054039/mixtral_8x7B_continuetrain:flagscale:C500:4:8:1/round1/10.1.15.112_noderank0/outputs_mixtral
+ exp_name: mixtral-8x7b
+ runner:
+ backend: torchrun
+ hostfile: /share/project/mixtral_project/tmp_lyt/FlagScale/hostfile
+ nnodes: 4
+ nproc_per_node: 8
+ ssh_port: 1234
+ task:
+ backend: megatron
+ entrypoint: flagscale/train/train_mixtral.py
+ type: train
hydra:
run:
- dir: ${experiment.exp_dir}/hydra
+ dir: ${experiment.exp_dir}/hydra
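The rewritten config.yaml wires the MetaX/MACA toolchain entirely through experiment.envs and pins the launch topology at 4 nodes x 8 processes under the torchrun backend. Below is a minimal sketch for sanity-checking the composed file before launching, assuming PyYAML is installed and the relative path matches your checkout:

    import yaml  # PyYAML; assumed to be available in the training environment

    # Load the experiment config touched by this patch and report the launch
    # topology plus the MetaX-specific environment it injects.
    with open("examples/mixtral/conf/config.yaml") as f:
        cfg = yaml.safe_load(f)

    runner = cfg["experiment"]["runner"]
    world_size = runner["nnodes"] * runner["nproc_per_node"]  # 4 * 8 = 32 ranks
    print(f"backend={runner['backend']} nnodes={runner['nnodes']} "
          f"nproc_per_node={runner['nproc_per_node']} world_size={world_size}")

    # The MACA toolchain is injected purely via environment variables.
    for key in ("MACA_PATH", "CUCC_PATH", "CUDA_PATH", "CUDA_VISIBLE_DEVICES"):
        print(key, "=", cfg["experiment"]["envs"].get(key))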
diff --git a/examples/mixtral/conf/train/train_mixtral_8x7b.yaml b/examples/mixtral/conf/train/train_mixtral_8x7b.yaml
index 9332c5e9..ef1e3f8d 100644
--- a/examples/mixtral/conf/train/train_mixtral_8x7b.yaml
+++ b/examples/mixtral/conf/train/train_mixtral_8x7b.yaml
@@ -1,73 +1,101 @@
-system:
- tensor_model_parallel_size: 2
- pipeline_model_parallel_size: 4
- expert_model_parallel_size: 4
- use_mcore_models: true
- transformer_impl: transformer_engine
- sequence_parallel: true
- use_distributed_optimizer: true
- precision:
- bf16: true
- logging:
- log_interval: 1
- tensorboard_log_interval: 1
- wandb_project: mixtral
- wandb_exp_name: mixtral-8x7b
- checkpoint:
- ckpt_format: torch
- save_interval: 200
-
-
+data:
+ data_path: /metax/dataset/SAMPLE50B/mixtral/mixtral_dataset/dedup-md5-pile-pile-cc_text_document
+ split: 1
+ tokenizer:
+ make_vocab_size_divisible_by: 64
+ tokenizer_path: /metax/dataset/SAMPLE50B/mixtral/mixtral_tokenizer
+ tokenizer_type: QwenTokenizerFS
model:
- num_layers: 32
- hidden_size: 4096
+ attention_dropout: 0.0
+ clip_grad: 1.0
+ disable_bias_linear: true
+ eval_interval: 1000
+ eval_iters: 10
ffn_hidden_size: 14336
- num_attention_heads: 32
- seq_length: 4096
- max_position_embeddings: 32768
- swiglu: true
- normalization: RMSNorm
- norm_epsilon: 1e-05
+ global_batch_size: 64
group_query_attention: true
- num_query_groups: 8
- init_method_std: 0.02
- attention_dropout: 0.0
hidden_dropout: 0.0
- disable_bias_linear: true
- position_embedding_type: rope
- rotary_base: 1000000
- no_position_embedding: true
- no_masked_softmax_fusion: true
- untie_embeddings_and_output_weights: true
- # moe args
- num_experts: 8
+ hidden_size: 4096
+ init_method_std: 0.02
+ max_position_embeddings: 32768
+ micro_batch_size: 1
+ moe_aux_loss_coeff: 0.02
moe_router_load_balancing_type: aux_loss
moe_router_topk: 2
- moe_aux_loss_coeff: 0.02
- # moe_grouped_gemm: true
-
- # seed: 42
- train_iters: 1000
- micro_batch_size: 1
- global_batch_size: 2048
- clip_grad: 1.0
- eval_interval: 1000
- eval_iters: 10
-
+ no_masked_softmax_fusion: true
+ no_position_embedding: true
+ norm_epsilon: 1e-05
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_experts: 8
+ num_layers: 32
+ num_query_groups: 8
optimizer:
- lr: 1e-4
- weight_decay: 0.1
+ lr: 1.0e-06
lr_scheduler:
- min_lr: 1.0e-5
- lr_warmup_iters: 500
lr_decay_iters: 320000
lr_decay_style: cosine
-
-
-data:
- data_path: <xxxx>
- split: 1
- tokenizer:
- tokenizer_type: QwenTokenizerFS
- tokenizer_path: <xxxx>
- make_vocab_size_divisible_by: 64
+ lr_warmup_iters: 5
+ min_lr: 1.0e-07
+ weight_decay: 0.1
+ position_embedding_type: rope
+ rotary_base: 1000000
+ seq_length: 4096
+ swiglu: true
+ train_iters: 10
+ untie_embeddings_and_output_weights: true
+system:
+ checkpoint:
+ ckpt_format: torch
+ finetune: true
+ load: /metax/dataset/mixtral_tp2_pp4_ep4_latest
+ no-load-optim: true
+ no-load-rng: true
+ no_save_optim: true
+ no_save_rng: true
+ recompute_granularity: full
+ recompute_granularity_per_stage:
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ recompute_method: block
+ recompute_method_per_stage:
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ recompute_num_layers: 0
+ recompute_num_layers_per_stage:
+ - 1
+ - 0
+ - 1
+ - 0
+ - 1
+ - 0
+ - 1
+ - 0
+ save: /share/project/lyt/ckpt
+ save_interval: 1000
+ expert_model_parallel_size: 4
+ logging:
+ log_interval: 1
+ tensorboard_log_interval: 1
+ wandb_exp_name: mixtral-8x7b
+ wandb_project: mixtral
+ pipeline_model_parallel_size: 4
+ precision:
+ bf16: true
+ sequence_parallel: true
+ tensor_model_parallel_size: 2
+ transformer_impl: transformer_engine
+ use_distributed_optimizer: true
+ use_mcore_models: true
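With tensor parallel 2, pipeline parallel 4 and the 32 ranks defined in config.yaml, the data-parallel size comes out to 4, so global_batch_size 64 at micro_batch_size 1 implies 16 gradient-accumulation micro-steps per iteration; expert parallelism (size 4) is carved out of the data-parallel group and does not change this arithmetic. A quick check of that bookkeeping, using the standard Megatron-LM relationship and the values from the YAML above:

    # Batch/parallelism bookkeeping for the settings above (illustrative check,
    # not part of the patch).
    world_size = 4 * 8                    # nnodes * nproc_per_node from config.yaml
    tp, pp = 2, 4                         # tensor / pipeline model parallel sizes
    dp = world_size // (tp * pp)          # -> 4 data-parallel replicas

    micro_batch_size, global_batch_size = 1, 64
    assert global_batch_size % (micro_batch_size * dp) == 0
    grad_accum_steps = global_batch_size // (micro_batch_size * dp)  # -> 16
    print(dp, grad_accum_steps)           # 4 16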
diff --git a/megatron/megatron/core/dist_checkpointing/strategies/base.py b/megatron/megatron/core/dist_checkpointing/strategies/base.py
index cc1c83b9..43b6b95e 100644
--- a/megatron/megatron/core/dist_checkpointing/strategies/base.py
+++ b/megatron/megatron/core/dist_checkpointing/strategies/base.py
@@ -20,8 +20,8 @@ class StrategyAction(Enum):


_import_trigger = None
-default_strategies: DefaultDict[str, dict[tuple, Any]] = defaultdict(dict)
-
+#default_strategies: DefaultDict[str, dict[tuple, Any]] = defaultdict(dict)
+default_strategies = defaultdict(dict)
async_calls = AsyncCallsQueue()

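The dropped annotation mixes typing.DefaultDict with the built-in generic dict[tuple, Any], which only evaluates at runtime on Python 3.9+; commenting it out (presumably to keep the module importable on the Python 3.8 interpreters shipped with this stack) also loses the type hint. A sketch of an alternative that keeps the hint while staying 3.8-compatible, in case that matters for your build:

    # Python-3.8-compatible spelling of the annotation the patch comments out,
    # using only typing-module generics (alternative sketch, not the patch itself).
    from collections import defaultdict
    from typing import Any, DefaultDict, Dict, Tuple

    default_strategies: DefaultDict[str, Dict[Tuple, Any]] = defaultdict(dict)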
diff --git a/megatron/megatron/core/distributed/param_and_grad_buffer.py b/megatron/megatron/core/distributed/param_and_grad_buffer.py
index 77ecd7be..fd2e68a4 100644
--- a/megatron/megatron/core/distributed/param_and_grad_buffer.py
+++ b/megatron/megatron/core/distributed/param_and_grad_buffer.py
@@ -2,6 +2,7 @@

import logging
import math
+import numpy
import os
from enum import Enum
from typing import Dict, List, Optional
@@ -257,7 +258,7 @@ class ParamAndGradBuffer:
# This also helps cuBLAS pick more efficient algorithms for GEMMs.
# We now ensure that all buckets start at a memory address that is 256-byte
# aligned (128 values since params and grads use >= 16-bit precision).
- return _pad(bucket_end_index, math.lcm(self.data_parallel_world_size, 128))
+ return _pad(bucket_end_index, numpy.lcm(self.data_parallel_world_size, 128))
return bucket_end_index

def _pad_start_of_param_if_needed(param_start_index: int) -> int:
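math.lcm was only added in Python 3.9, so the numpy import and the switch to numpy.lcm keep the 256-byte bucket alignment working on older interpreters; for two positive integers the two functions agree (numpy just returns a NumPy integer, which the padding arithmetic handles transparently). A small equivalence check, assuming NumPy is installed:

    import numpy

    # Mirror of the alignment computation above: pad a bucket end index up to the
    # least common multiple of the data-parallel world size and 128 values.
    def pad_up(index: int, granularity: int) -> int:
        return -(-index // granularity) * granularity  # ceil-divide, then scale

    dp_world_size, bucket_end_index = 24, 1_000_003
    aligned = pad_up(bucket_end_index, int(numpy.lcm(dp_world_size, 128)))
    print(aligned)  # 1000320; math.lcm gives the same granularity on Python >= 3.9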
diff --git a/megatron/megatron/legacy/fused_kernels/__init__.py b/megatron/megatron/legacy/fused_kernels/__init__.py
index 87cceac3..5a04def1 100644
--- a/megatron/megatron/legacy/fused_kernels/__init__.py
+++ b/megatron/megatron/legacy/fused_kernels/__init__.py
@@ -56,7 +56,7 @@ def load(args):

def _get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output(
- [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+ [cuda_dir + "/bin/cucc", "-V"], universal_newlines=True
)
output = raw_output.split()
release_idx = output.index("release") + 1
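On the MACA cu-bridge the nvcc front end is exposed as cucc (CUCC_PATH and CUDA_PATH both point at /opt/maca/tools/cu-bridge in config.yaml), so the bare-metal version probe has to call cucc -V instead. The parsing that follows in this function only needs the banner to contain a "release <major>.<minor>" token; a standalone sketch of that parsing with an illustrative nvcc-style banner (the exact cucc output format is assumed, not verified here):

    # Illustrative stand-in for subprocess.check_output([cuda_dir + "/bin/cucc", "-V"]).
    raw_output = "Cuda compilation tools, release 11.6, V11.6.124"  # assumed banner

    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")        # ["11", "6,"]
    bare_metal_major, bare_metal_minor = release[0], release[1][0]
    print(bare_metal_major, bare_metal_minor)       # 11 6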
diff --git a/megatron/megatron/training/checkpointing.py b/megatron/megatron/training/checkpointing.py
index 6e58b317..20878032 100644
--- a/megatron/megatron/training/checkpointing.py
+++ b/megatron/megatron/training/checkpointing.py
@@ -1059,11 +1059,11 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
# Model.
strict = False if args.retro_add_retriever else strict
if len(model) == 1:
- model[0].load_state_dict(state_dict['model'], strict=strict)
+ model[0].load_state_dict(state_dict['model'], strict=False)
else:
for i in range(len(model)):
mpu.set_virtual_pipeline_model_parallel_rank(i)
- model[i].load_state_dict(state_dict['model%d' % i], strict=False)
+ model[i].load_state_dict(state_dict['model%d' % i], strict=False)

# Fix up query/key/value matrix ordering if needed.
checkpoint_version = get_checkpoint_version()
--
2.34.1
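Hard-coding strict=False in load_checkpoint lets the continue-train run restore a checkpoint whose keys do not exactly match the freshly built model (for example extra or missing buffers between the saving and loading builds); mismatched entries are skipped instead of raising. Note that this relaxes key checking for every load, not only continue-training runs. A toy illustration of the non-strict behaviour, unrelated to the actual Mixtral weights:

    import torch

    # A spare key in the state dict stands in for a buffer that exists in the
    # checkpoint but not in the current model build.
    model = torch.nn.Linear(4, 4)
    state = model.state_dict()
    state["stale_buffer"] = torch.zeros(1)

    result = model.load_state_dict(state, strict=False)
    print(result.missing_keys, result.unexpected_keys)  # [] ['stale_buffer']
    # With the default strict=True the same call raises RuntimeError.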