[metax] add mixtral-8x7b flagscale continue train patch (FlagOpen#228)

Co-authored-by: sfwang <wangshunfei2010@126.com>
lytm00847 and suenphey authored Oct 5, 2024
1 parent 54fb18d commit 405d753
Showing 1 changed file with 322 additions and 0 deletions.
hardware/metax_C500/237377e9/237377e9.patch (+322 −0)
@@ -0,0 +1,322 @@
From 0b008a5c8581dabee7a06533c2a926199d064546 Mon Sep 17 00:00:00 2001
From: sfwang <wangshunfei2010@126.com>
Date: Sat, 5 Oct 2024 07:49:05 +0000
Subject: [PATCH] add mixtral-8x7b test case

---
examples/mixtral/conf/config.yaml | 58 +++++--
.../conf/train/train_mixtral_8x7b.yaml | 154 +++++++++++-------
.../dist_checkpointing/strategies/base.py | 4 +-
.../core/distributed/param_and_grad_buffer.py | 3 +-
.../megatron/legacy/fused_kernels/__init__.py | 2 +-
megatron/megatron/training/checkpointing.py | 4 +-
6 files changed, 139 insertions(+), 86 deletions(-)

diff --git a/examples/mixtral/conf/config.yaml b/examples/mixtral/conf/config.yaml
index 3e8c10f7..2a9e5fc6 100644
--- a/examples/mixtral/conf/config.yaml
+++ b/examples/mixtral/conf/config.yaml
@@ -1,24 +1,48 @@
+action: run
defaults:
- - _self_
- - train: train_mixtral_8x7b
-
+- _self_
+- train: train_mixtral_8x7b
experiment:
- exp_name: mixtral-8x7b
- exp_dir: outputs
- task:
- type: train
- backend: megatron
- entrypoint: flagscale/train/train_mixtral.py
- runner:
- backend: torchrun
- hostfile: <xxxx>
+ cmds:
+ before_start: ''
envs:
- CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+ CUCC_PATH: /opt/maca/tools/cu-bridge
CUDA_DEVICE_MAX_CONNECTIONS: 1
+ CUDA_PATH: /opt/maca/tools/cu-bridge
+ CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+ DEVINFO_ROOT: /opt/maca
+ FORCE_ACTIVATE_WAIT: 1
+ LD_LIBRARY_PATH: /opt/maca/lib:/opt/maca/mxgpu_llvm/lib:/opt/mxdriver/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib
+ MACA_CLANG: /opt/maca/mxgpu_llvm
+ MACA_CLANG_PATH: /opt/maca/mxgpu_llvm/bin
+ MACA_PATH: /opt/maca
+ MACA_SMALL_PAGESIZE_ENABLE: 1
+ MAX_JOBS: 20
+ MCCL_IB_GID_INDEX: 1
+ MCCL_LIMIT_RING_LL_THREADTHRESHOLDS: 1
+ MCCL_MAX_NCHANNELS: 16
+ MCCL_NET_GDR_LEVEL: 7
+ MCCL_P2P_LEVEL: SYS
+ MCPYTORCH_DISABLE_PRINT: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-
-action: run
-
+ NVTE_APPLY_QK_LAYER_SCALING: 0
+ NVTE_FLASH_ATTN: 1
+ NVTE_FUSED_ATTN: 0
+ PATH: /opt/conda/bin:/opt/conda/condabin:/opt/maca/tools/cu-bridge:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/conda/bin:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ PYTORCH_ENABLE_SAME_RAND_A100: 1
+ SET_DEVICE_NUMA_PREFERRED: 1
+ exp_dir: /share/project/lyt/pr_tmp/FlagPerf/training/result/run20241005054039/mixtral_8x7B_continuetrain:flagscale:C500:4:8:1/round1/10.1.15.112_noderank0/outputs_mixtral
+ exp_name: mixtral-8x7b
+ runner:
+ backend: torchrun
+ hostfile: /share/project/mixtral_project/tmp_lyt/FlagScale/hostfile
+ nnodes: 4
+ nproc_per_node: 8
+ ssh_port: 1234
+ task:
+ backend: megatron
+ entrypoint: flagscale/train/train_mixtral.py
+ type: train
hydra:
run:
- dir: ${experiment.exp_dir}/hydra
+ dir: ${experiment.exp_dir}/hydra
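Editor's note: the rewritten config replaces the `<xxxx>` placeholders with a concrete four-node MetaX C500 setup (MACA toolchain paths, MCCL fabric tuning, torchrun across 32 GPUs). The `${experiment.exp_dir}/hydra` value is an OmegaConf interpolation that Hydra resolves against the merged config; a minimal sketch of that mechanism, with a stand-in path:

```python
# Minimal sketch of the ${experiment.exp_dir} interpolation used above;
# OmegaConf resolves it lazily against the merged config tree.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "experiment": {"exp_dir": "/tmp/outputs_mixtral"},   # stand-in path
    "hydra": {"run": {"dir": "${experiment.exp_dir}/hydra"}},
})
assert cfg.hydra.run.dir == "/tmp/outputs_mixtral/hydra"
```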
diff --git a/examples/mixtral/conf/train/train_mixtral_8x7b.yaml b/examples/mixtral/conf/train/train_mixtral_8x7b.yaml
index 9332c5e9..ef1e3f8d 100644
--- a/examples/mixtral/conf/train/train_mixtral_8x7b.yaml
+++ b/examples/mixtral/conf/train/train_mixtral_8x7b.yaml
@@ -1,73 +1,101 @@
-system:
- tensor_model_parallel_size: 2
- pipeline_model_parallel_size: 4
- expert_model_parallel_size: 4
- use_mcore_models: true
- transformer_impl: transformer_engine
- sequence_parallel: true
- use_distributed_optimizer: true
- precision:
- bf16: true
- logging:
- log_interval: 1
- tensorboard_log_interval: 1
- wandb_project: mixtral
- wandb_exp_name: mixtral-8x7b
- checkpoint:
- ckpt_format: torch
- save_interval: 200
-
-
+data:
+ data_path: /metax/dataset/SAMPLE50B/mixtral/mixtral_dataset/dedup-md5-pile-pile-cc_text_document
+ split: 1
+ tokenizer:
+ make_vocab_size_divisible_by: 64
+ tokenizer_path: /metax/dataset/SAMPLE50B/mixtral/mixtral_tokenizer
+ tokenizer_type: QwenTokenizerFS
model:
- num_layers: 32
- hidden_size: 4096
+ attention_dropout: 0.0
+ clip_grad: 1.0
+ disable_bias_linear: true
+ eval_interval: 1000
+ eval_iters: 10
ffn_hidden_size: 14336
- num_attention_heads: 32
- seq_length: 4096
- max_position_embeddings: 32768
- swiglu: true
- normalization: RMSNorm
- norm_epsilon: 1e-05
+ global_batch_size: 64
group_query_attention: true
- num_query_groups: 8
- init_method_std: 0.02
- attention_dropout: 0.0
hidden_dropout: 0.0
- disable_bias_linear: true
- position_embedding_type: rope
- rotary_base: 1000000
- no_position_embedding: true
- no_masked_softmax_fusion: true
- untie_embeddings_and_output_weights: true
- # moe args
- num_experts: 8
+ hidden_size: 4096
+ init_method_std: 0.02
+ max_position_embeddings: 32768
+ micro_batch_size: 1
+ moe_aux_loss_coeff: 0.02
moe_router_load_balancing_type: aux_loss
moe_router_topk: 2
- moe_aux_loss_coeff: 0.02
- # moe_grouped_gemm: true
-
- # seed: 42
- train_iters: 1000
- micro_batch_size: 1
- global_batch_size: 2048
- clip_grad: 1.0
- eval_interval: 1000
- eval_iters: 10
-
+ no_masked_softmax_fusion: true
+ no_position_embedding: true
+ norm_epsilon: 1e-05
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_experts: 8
+ num_layers: 32
+ num_query_groups: 8
optimizer:
- lr: 1e-4
- weight_decay: 0.1
+ lr: 1.0e-06
lr_scheduler:
- min_lr: 1.0e-5
- lr_warmup_iters: 500
lr_decay_iters: 320000
lr_decay_style: cosine
-
-
-data:
- data_path: <xxxx>
- split: 1
- tokenizer:
- tokenizer_type: QwenTokenizerFS
- tokenizer_path: <xxxx>
- make_vocab_size_divisible_by: 64
+ lr_warmup_iters: 5
+ min_lr: 1.0e-07
+ weight_decay: 0.1
+ position_embedding_type: rope
+ rotary_base: 1000000
+ seq_length: 4096
+ swiglu: true
+ train_iters: 10
+ untie_embeddings_and_output_weights: true
+system:
+ checkpoint:
+ ckpt_format: torch
+ finetune: true
+ load: /metax/dataset/mixtral_tp2_pp4_ep4_latest
+ no-load-optim: true
+ no-load-rng: true
+ no_save_optim: true
+ no_save_rng: true
+ recompute_granularity: full
+ recompute_granularity_per_stage:
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ recompute_method: block
+ recompute_method_per_stage:
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ recompute_num_layers: 0
+ recompute_num_layers_per_stage:
+ - 1
+ - 0
+ - 1
+ - 0
+ - 1
+ - 0
+ - 1
+ - 0
+ save: /share/project/lyt/ckpt
+ save_interval: 1000
+ expert_model_parallel_size: 4
+ logging:
+ log_interval: 1
+ tensorboard_log_interval: 1
+ wandb_exp_name: mixtral-8x7b
+ wandb_project: mixtral
+ pipeline_model_parallel_size: 4
+ precision:
+ bf16: true
+ sequence_parallel: true
+ tensor_model_parallel_size: 2
+ transformer_impl: transformer_engine
+ use_distributed_optimizer: true
+ use_mcore_models: true
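Editor's note: the training config is reshaped into a short continue-train run (train_iters 10, lr 1.0e-06, finetune from the converted tp2/pp4/ep4 checkpoint) rather than the original 1000-iteration recipe. A back-of-envelope check of the parallel layout it implies, using standard Megatron-LM arithmetic with values copied from the YAML above:

```python
# Sanity-check the parallel layout: world size must factor into
# tensor-parallel * pipeline-parallel * data-parallel replicas.
world_size = 4 * 8                    # nnodes * nproc_per_node
tp, pp = 2, 4                         # tensor / pipeline parallel sizes
dp = world_size // (tp * pp)          # data-parallel replicas -> 4

micro, global_batch = 1, 64
assert global_batch % (micro * dp) == 0
grad_accum = global_batch // (micro * dp)   # 16 micro-batches per step
print(f"dp={dp}, gradient accumulation steps={grad_accum}")
```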
diff --git a/megatron/megatron/core/dist_checkpointing/strategies/base.py b/megatron/megatron/core/dist_checkpointing/strategies/base.py
index cc1c83b9..43b6b95e 100644
--- a/megatron/megatron/core/dist_checkpointing/strategies/base.py
+++ b/megatron/megatron/core/dist_checkpointing/strategies/base.py
@@ -20,8 +20,8 @@ class StrategyAction(Enum):


_import_trigger = None
-default_strategies: DefaultDict[str, dict[tuple, Any]] = defaultdict(dict)
-
+#default_strategies: DefaultDict[str, dict[tuple, Any]] = defaultdict(dict)
+default_strategies = defaultdict(dict)
async_calls = AsyncCallsQueue()


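Editor's note: dropping the `DefaultDict[str, dict[tuple, Any]]` annotation sidesteps PEP 585 builtin generics. The inner `dict[tuple, Any]` subscripts the builtin `dict`, which is evaluated eagerly at module scope and raises TypeError on Python < 3.9; the untyped defaultdict keeps the module importable on older interpreters. Runtime behaviour of the registry is unchanged (key names below are illustrative stand-ins):

```python
# Behaviourally identical registry, with the 3.9-only annotation gone.
from collections import defaultdict

default_strategies = defaultdict(dict)   # str -> {tuple: strategy}

# Missing top-level keys still produce a fresh dict on first access.
default_strategies["load_sharded"][("torch_dist", 1)] = object()
assert ("torch_dist", 1) in default_strategies["load_sharded"]
```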
diff --git a/megatron/megatron/core/distributed/param_and_grad_buffer.py b/megatron/megatron/core/distributed/param_and_grad_buffer.py
index 77ecd7be..fd2e68a4 100644
--- a/megatron/megatron/core/distributed/param_and_grad_buffer.py
+++ b/megatron/megatron/core/distributed/param_and_grad_buffer.py
@@ -2,6 +2,7 @@

import logging
import math
+import numpy
import os
from enum import Enum
from typing import Dict, List, Optional
@@ -257,7 +258,7 @@ class ParamAndGradBuffer:
# This also helps cuBLAS pick more efficient algorithms for GEMMs.
# We now ensure that all buckets start at a memory address that is 256-byte
# aligned (128 values since params and grads use >= 16-bit precision).
- return _pad(bucket_end_index, math.lcm(self.data_parallel_world_size, 128))
+ return _pad(bucket_end_index, numpy.lcm(self.data_parallel_world_size, 128))
return bucket_end_index

def _pad_start_of_param_if_needed(param_start_index: int) -> int:
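Editor's note: `math.lcm` only exists on Python >= 3.9, whereas `numpy.lcm` is a ufunc available much earlier, so the swap keeps the 256-byte bucket padding working on an older interpreter (an assumption about the motivation; the commit does not state it). One caveat: `numpy.lcm` returns a NumPy integer rather than a builtin `int`.

```python
# Equivalence sketch: both compute the least common multiple, but only
# numpy.lcm is available before Python 3.9.
import numpy

assert numpy.lcm(12, 128) == 384       # 12 = 2^2*3, 128 = 2^7 -> 2^7*3
assert int(numpy.lcm(8, 128)) == 128   # NumPy int; cast if a builtin int is needed
```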
diff --git a/megatron/megatron/legacy/fused_kernels/__init__.py b/megatron/megatron/legacy/fused_kernels/__init__.py
index 87cceac3..5a04def1 100644
--- a/megatron/megatron/legacy/fused_kernels/__init__.py
+++ b/megatron/megatron/legacy/fused_kernels/__init__.py
@@ -56,7 +56,7 @@ def load(args):

def _get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output(
- [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+ [cuda_dir + "/bin/cucc", "-V"], universal_newlines=True
)
output = raw_output.split()
release_idx = output.index("release") + 1
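Editor's note: MetaX's cu-bridge exposes a CUDA-compatible compiler front end named `cucc` in place of `nvcc` (both CUCC_PATH and CUDA_PATH point at /opt/maca/tools/cu-bridge in the config above). The parsing that follows the probe assumes `cucc -V` prints an nvcc-style banner; a sketch of that parse, with a made-up banner string:

```python
# Sketch of the banner parsing done by _get_cuda_bare_metal_version,
# assuming cucc -V emits nvcc-style output (the banner below is made up).
raw_output = "Cuda compilation tools, release 11.6, V11.6.124"
output = raw_output.split()
release = output[output.index("release") + 1]     # "11.6,"
major, minor = release.rstrip(",").split(".")
assert (major, minor) == ("11", "6")
```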
diff --git a/megatron/megatron/training/checkpointing.py b/megatron/megatron/training/checkpointing.py
index 6e58b317..20878032 100644
--- a/megatron/megatron/training/checkpointing.py
+++ b/megatron/megatron/training/checkpointing.py
@@ -1059,11 +1059,11 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
# Model.
strict = False if args.retro_add_retriever else strict
if len(model) == 1:
- model[0].load_state_dict(state_dict['model'], strict=strict)
+ model[0].load_state_dict(state_dict['model'], strict=False)
else:
for i in range(len(model)):
mpu.set_virtual_pipeline_model_parallel_rank(i)
- model[i].load_state_dict(state_dict['model%d' % i], strict=strict)
+ model[i].load_state_dict(state_dict['model%d' % i], strict=False)

# Fix up query/key/value matrix ordering if needed.
checkpoint_version = get_checkpoint_version()
--
2.34.1
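Editor's note: the final hunk hard-codes `strict=False` so that resuming from the converted checkpoint tolerates missing or unexpected state-dict keys instead of aborting. In plain PyTorch terms, with a toy model:

```python
# What strict=False buys during load: mismatched keys are reported,
# not fatal. The model below is illustrative only.
import torch

model = torch.nn.Linear(4, 4)
partial_state = {"weight": torch.zeros(4, 4)}   # "bias" deliberately absent
result = model.load_state_dict(partial_state, strict=False)
print(result.missing_keys)      # ['bias']
print(result.unexpected_keys)   # []
```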
