Commit: [metax] add mixtral-8x7b flagscale continue train patch (FlagOpen#228)
Co-authored-by: sfwang <wangshunfei2010@126.com>
Showing 1 changed file with 322 additions and 0 deletions.
@@ -0,0 +1,322 @@
From 0b008a5c8581dabee7a06533c2a926199d064546 Mon Sep 17 00:00:00 2001
From: sfwang <wangshunfei2010@126.com>
Date: Sat, 5 Oct 2024 07:49:05 +0000
Subject: [PATCH] add mixtral-8x7b test case

---
 examples/mixtral/conf/config.yaml | 58 +++++--
 .../conf/train/train_mixtral_8x7b.yaml | 154 +++++++++++-------
 .../dist_checkpointing/strategies/base.py | 4 +-
 .../core/distributed/param_and_grad_buffer.py | 3 +-
 megatron/megatron/legacy/fused_kernels/__init__.py | 2 +-
 megatron/megatron/training/checkpointing.py | 4 +-
 6 files changed, 139 insertions(+), 86 deletions(-)

diff --git a/examples/mixtral/conf/config.yaml b/examples/mixtral/conf/config.yaml
index 3e8c10f7..2a9e5fc6 100644
--- a/examples/mixtral/conf/config.yaml
+++ b/examples/mixtral/conf/config.yaml
@@ -1,24 +1,48 @@
+action: run
defaults:
- - _self_
- - train: train_mixtral_8x7b
-
+- _self_
+- train: train_mixtral_8x7b
experiment:
- exp_name: mixtral-8x7b
- exp_dir: outputs
- task:
- type: train
- backend: megatron
- entrypoint: flagscale/train/train_mixtral.py
- runner:
- backend: torchrun
- hostfile: <xxxx>
+ cmds:
+ before_start: ''
envs:
- CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+ CUCC_PATH: /opt/maca/tools/cu-bridge
CUDA_DEVICE_MAX_CONNECTIONS: 1
+ CUDA_PATH: /opt/maca/tools/cu-bridge
+ CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+ DEVINFO_ROOT: /opt/maca
+ FORCE_ACTIVATE_WAIT: 1
+ LD_LIBRARY_PATH: /opt/maca/lib:/opt/maca/mxgpu_llvm/lib:/opt/mxdriver/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib
+ MACA_CLANG: /opt/maca/mxgpu_llvm
+ MACA_CLANG_PATH: /opt/maca/mxgpu_llvm/bin
+ MACA_PATH: /opt/maca
+ MACA_SMALL_PAGESIZE_ENABLE: 1
+ MAX_JOBS: 20
+ MCCL_IB_GID_INDEX: 1
+ MCCL_LIMIT_RING_LL_THREADTHRESHOLDS: 1
+ MCCL_MAX_NCHANNELS: 16
+ MCCL_NET_GDR_LEVEL: 7
+ MCCL_P2P_LEVEL: SYS
+ MCPYTORCH_DISABLE_PRINT: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-
-action: run
-
+ NVTE_APPLY_QK_LAYER_SCALING: 0
+ NVTE_FLASH_ATTN: 1
+ NVTE_FUSED_ATTN: 0
+ PATH: /opt/conda/bin:/opt/conda/condabin:/opt/maca/tools/cu-bridge:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/conda/bin:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ PYTORCH_ENABLE_SAME_RAND_A100: 1
+ SET_DEVICE_NUMA_PREFERRED: 1
+ exp_dir: /share/project/lyt/pr_tmp/FlagPerf/training/result/run20241005054039/mixtral_8x7B_continuetrain:flagscale:C500:4:8:1/round1/10.1.15.112_noderank0/outputs_mixtral
+ exp_name: mixtral-8x7b
+ runner:
+ backend: torchrun
+ hostfile: /share/project/mixtral_project/tmp_lyt/FlagScale/hostfile
+ nnodes: 4
+ nproc_per_node: 8
+ ssh_port: 1234
+ task:
+ backend: megatron
+ entrypoint: flagscale/train/train_mixtral.py
+ type: train
hydra:
run:
- dir: ${experiment.exp_dir}/hydra
+ dir: ${experiment.exp_dir}/hydra
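The rewritten config.yaml wires the MetaX/MACA toolchain entirely through experiment.envs and pins the launch topology at 4 nodes x 8 processes under the torchrun backend. Below is a minimal sketch for sanity-checking the composed file before launching, assuming PyYAML is installed and the relative path matches your checkout:

    import yaml  # PyYAML; assumed to be available in the training environment

    # Load the experiment config touched by this patch and report the launch
    # topology plus the MetaX-specific environment it injects.
    with open("examples/mixtral/conf/config.yaml") as f:
        cfg = yaml.safe_load(f)

    runner = cfg["experiment"]["runner"]
    world_size = runner["nnodes"] * runner["nproc_per_node"]  # 4 * 8 = 32 ranks
    print(f"backend={runner['backend']} nnodes={runner['nnodes']} "
          f"nproc_per_node={runner['nproc_per_node']} world_size={world_size}")

    # The MACA toolchain is injected purely via environment variables.
    for key in ("MACA_PATH", "CUCC_PATH", "CUDA_PATH", "CUDA_VISIBLE_DEVICES"):
        print(key, "=", cfg["experiment"]["envs"].get(key))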
diff --git a/examples/mixtral/conf/train/train_mixtral_8x7b.yaml b/examples/mixtral/conf/train/train_mixtral_8x7b.yaml
index 9332c5e9..ef1e3f8d 100644
--- a/examples/mixtral/conf/train/train_mixtral_8x7b.yaml
+++ b/examples/mixtral/conf/train/train_mixtral_8x7b.yaml
@@ -1,73 +1,101 @@
-system:
- tensor_model_parallel_size: 2
- pipeline_model_parallel_size: 4
- expert_model_parallel_size: 4
- use_mcore_models: true
- transformer_impl: transformer_engine
- sequence_parallel: true
- use_distributed_optimizer: true
- precision:
- bf16: true
- logging:
- log_interval: 1
- tensorboard_log_interval: 1
- wandb_project: mixtral
- wandb_exp_name: mixtral-8x7b
- checkpoint:
- ckpt_format: torch
- save_interval: 200
-
-
+data:
+ data_path: /metax/dataset/SAMPLE50B/mixtral/mixtral_dataset/dedup-md5-pile-pile-cc_text_document
+ split: 1
+ tokenizer:
+ make_vocab_size_divisible_by: 64
+ tokenizer_path: /metax/dataset/SAMPLE50B/mixtral/mixtral_tokenizer
+ tokenizer_type: QwenTokenizerFS
model:
- num_layers: 32
- hidden_size: 4096
+ attention_dropout: 0.0
+ clip_grad: 1.0
+ disable_bias_linear: true
+ eval_interval: 1000
+ eval_iters: 10
ffn_hidden_size: 14336
- num_attention_heads: 32
- seq_length: 4096
- max_position_embeddings: 32768
- swiglu: true
- normalization: RMSNorm
- norm_epsilon: 1e-05
+ global_batch_size: 64
group_query_attention: true
- num_query_groups: 8
- init_method_std: 0.02
- attention_dropout: 0.0
hidden_dropout: 0.0
- disable_bias_linear: true
- position_embedding_type: rope
- rotary_base: 1000000
- no_position_embedding: true
- no_masked_softmax_fusion: true
- untie_embeddings_and_output_weights: true
- # moe args
- num_experts: 8
+ hidden_size: 4096
+ init_method_std: 0.02
+ max_position_embeddings: 32768
+ micro_batch_size: 1
+ moe_aux_loss_coeff: 0.02
moe_router_load_balancing_type: aux_loss
moe_router_topk: 2
- moe_aux_loss_coeff: 0.02
- # moe_grouped_gemm: true
-
- # seed: 42
- train_iters: 1000
- micro_batch_size: 1
- global_batch_size: 2048
- clip_grad: 1.0
- eval_interval: 1000
- eval_iters: 10
-
+ no_masked_softmax_fusion: true
+ no_position_embedding: true
+ norm_epsilon: 1e-05
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_experts: 8
+ num_layers: 32
+ num_query_groups: 8
optimizer:
- lr: 1e-4
- weight_decay: 0.1
+ lr: 1.0e-06
lr_scheduler:
- min_lr: 1.0e-5
- lr_warmup_iters: 500
lr_decay_iters: 320000
lr_decay_style: cosine
-
-
-data:
- data_path: <xxxx>
- split: 1
- tokenizer:
- tokenizer_type: QwenTokenizerFS
- tokenizer_path: <xxxx>
- make_vocab_size_divisible_by: 64
+ lr_warmup_iters: 5
+ min_lr: 1.0e-07
+ weight_decay: 0.1
+ position_embedding_type: rope
+ rotary_base: 1000000
+ seq_length: 4096
+ swiglu: true
+ train_iters: 10
+ untie_embeddings_and_output_weights: true
+system:
+ checkpoint:
+ ckpt_format: torch
+ finetune: true
+ load: /metax/dataset/mixtral_tp2_pp4_ep4_latest
+ no-load-optim: true
+ no-load-rng: true
+ no_save_optim: true
+ no_save_rng: true
+ recompute_granularity: full
+ recompute_granularity_per_stage:
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ recompute_method: block
+ recompute_method_per_stage:
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ - 1
+ recompute_num_layers: 0
+ recompute_num_layers_per_stage:
+ - 1
+ - 0
+ - 1
+ - 0
+ - 1
+ - 0
+ - 1
+ - 0
+ save: /share/project/lyt/ckpt
+ save_interval: 1000
+ expert_model_parallel_size: 4
+ logging:
+ log_interval: 1
+ tensorboard_log_interval: 1
+ wandb_exp_name: mixtral-8x7b
+ wandb_project: mixtral
+ pipeline_model_parallel_size: 4
+ precision:
+ bf16: true
+ sequence_parallel: true
+ tensor_model_parallel_size: 2
+ transformer_impl: transformer_engine
+ use_distributed_optimizer: true
+ use_mcore_models: true
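With tensor parallel 2, pipeline parallel 4 and the 32 ranks defined in config.yaml, the data-parallel size comes out to 4, so global_batch_size 64 at micro_batch_size 1 implies 16 gradient-accumulation micro-steps per iteration; expert parallelism (size 4) is carved out of the data-parallel group and does not change this arithmetic. A quick check of that bookkeeping, using the standard Megatron-LM relationship and the values from the YAML above:

    # Batch/parallelism bookkeeping for the settings above (illustrative check,
    # not part of the patch).
    world_size = 4 * 8                    # nnodes * nproc_per_node from config.yaml
    tp, pp = 2, 4                         # tensor / pipeline model parallel sizes
    dp = world_size // (tp * pp)          # -> 4 data-parallel replicas

    micro_batch_size, global_batch_size = 1, 64
    assert global_batch_size % (micro_batch_size * dp) == 0
    grad_accum_steps = global_batch_size // (micro_batch_size * dp)  # -> 16
    print(dp, grad_accum_steps)           # 4 16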
diff --git a/megatron/megatron/core/dist_checkpointing/strategies/base.py b/megatron/megatron/core/dist_checkpointing/strategies/base.py
index cc1c83b9..43b6b95e 100644
--- a/megatron/megatron/core/dist_checkpointing/strategies/base.py
+++ b/megatron/megatron/core/dist_checkpointing/strategies/base.py
@@ -20,8 +20,8 @@ class StrategyAction(Enum):


_import_trigger = None
-default_strategies: DefaultDict[str, dict[tuple, Any]] = defaultdict(dict)
-
+#default_strategies: DefaultDict[str, dict[tuple, Any]] = defaultdict(dict)
+default_strategies = defaultdict(dict)
async_calls = AsyncCallsQueue()

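The dropped annotation mixes typing.DefaultDict with the built-in generic dict[tuple, Any], which only evaluates at runtime on Python 3.9+; commenting it out (presumably to keep the module importable on the Python 3.8 interpreters shipped with this stack) also loses the type hint. A sketch of an alternative that keeps the hint while staying 3.8-compatible, in case that matters for your build:

    # Python-3.8-compatible spelling of the annotation the patch comments out,
    # using only typing-module generics (alternative sketch, not the patch itself).
    from collections import defaultdict
    from typing import Any, DefaultDict, Dict, Tuple

    default_strategies: DefaultDict[str, Dict[Tuple, Any]] = defaultdict(dict)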
diff --git a/megatron/megatron/core/distributed/param_and_grad_buffer.py b/megatron/megatron/core/distributed/param_and_grad_buffer.py
index 77ecd7be..fd2e68a4 100644
--- a/megatron/megatron/core/distributed/param_and_grad_buffer.py
+++ b/megatron/megatron/core/distributed/param_and_grad_buffer.py
@@ -2,6 +2,7 @@

import logging
import math
+import numpy
import os
from enum import Enum
from typing import Dict, List, Optional
@@ -257,7 +258,7 @@ class ParamAndGradBuffer:
# This also helps cuBLAS pick more efficient algorithms for GEMMs.
# We now ensure that all buckets start at a memory address that is 256-byte
# aligned (128 values since params and grads use >= 16-bit precision).
- return _pad(bucket_end_index, math.lcm(self.data_parallel_world_size, 128))
+ return _pad(bucket_end_index, numpy.lcm(self.data_parallel_world_size, 128))
return bucket_end_index

def _pad_start_of_param_if_needed(param_start_index: int) -> int:
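math.lcm was only added in Python 3.9, so the numpy import and the switch to numpy.lcm keep the 256-byte bucket alignment working on older interpreters; for two positive integers the two functions agree (numpy just returns a NumPy integer, which the padding arithmetic handles transparently). A small equivalence check, assuming NumPy is installed:

    import numpy

    # Mirror of the alignment computation above: pad a bucket end index up to the
    # least common multiple of the data-parallel world size and 128 values.
    def pad_up(index: int, granularity: int) -> int:
        return -(-index // granularity) * granularity  # ceil-divide, then scale

    dp_world_size, bucket_end_index = 24, 1_000_003
    aligned = pad_up(bucket_end_index, int(numpy.lcm(dp_world_size, 128)))
    print(aligned)  # 1000320; math.lcm gives the same granularity on Python >= 3.9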
diff --git a/megatron/megatron/legacy/fused_kernels/__init__.py b/megatron/megatron/legacy/fused_kernels/__init__.py
index 87cceac3..5a04def1 100644
--- a/megatron/megatron/legacy/fused_kernels/__init__.py
+++ b/megatron/megatron/legacy/fused_kernels/__init__.py
@@ -56,7 +56,7 @@ def load(args):

def _get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output(
- [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+ [cuda_dir + "/bin/cucc", "-V"], universal_newlines=True
)
output = raw_output.split()
release_idx = output.index("release") + 1
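On the MACA cu-bridge the nvcc front end is exposed as cucc (CUCC_PATH and CUDA_PATH both point at /opt/maca/tools/cu-bridge in config.yaml), so the bare-metal version probe has to call cucc -V instead. The parsing that follows in this function only needs the banner to contain a "release <major>.<minor>" token; a standalone sketch of that parsing with an illustrative nvcc-style banner (the exact cucc output format is assumed, not verified here):

    # Illustrative stand-in for subprocess.check_output([cuda_dir + "/bin/cucc", "-V"]).
    raw_output = "Cuda compilation tools, release 11.6, V11.6.124"  # assumed banner

    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")        # ["11", "6,"]
    bare_metal_major, bare_metal_minor = release[0], release[1][0]
    print(bare_metal_major, bare_metal_minor)       # 11 6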
diff --git a/megatron/megatron/training/checkpointing.py b/megatron/megatron/training/checkpointing.py
index 6e58b317..20878032 100644
--- a/megatron/megatron/training/checkpointing.py
+++ b/megatron/megatron/training/checkpointing.py
@@ -1059,11 +1059,11 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
# Model.
strict = False if args.retro_add_retriever else strict
if len(model) == 1:
- model[0].load_state_dict(state_dict['model'], strict=strict)
+ model[0].load_state_dict(state_dict['model'], strict=False)
else:
for i in range(len(model)):
mpu.set_virtual_pipeline_model_parallel_rank(i)
- model[i].load_state_dict(state_dict['model%d' % i], strict=False)
+ model[i].load_state_dict(state_dict['model%d' % i], strict=False)

# Fix up query/key/value matrix ordering if needed.
checkpoint_version = get_checkpoint_version()
--
2.34.1
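Hard-coding strict=False in load_checkpoint lets the continue-train run restore a checkpoint whose keys do not exactly match the freshly built model (for example extra or missing buffers between the saving and loading builds); mismatched entries are skipped instead of raising. Note that this relaxes key checking for every load, not only continue-training runs. A toy illustration of the non-strict behaviour, unrelated to the actual Mixtral weights:

    import torch

    # A spare key in the state dict stands in for a buffer that exists in the
    # checkpoint but not in the current model build.
    model = torch.nn.Linear(4, 4)
    state = model.state_dict()
    state["stale_buffer"] = torch.zeros(1)

    result = model.load_state_dict(state, strict=False)
    print(result.missing_keys, result.unexpected_keys)  # [] ['stale_buffer']
    # With the default strict=True the same call raises RuntimeError.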