forked from FlagOpen/FlagScale
[metax] Add llama3-70B patch (FlagOpen#227)
Add llama3-70B patch

---------

Co-authored-by: caozhou <48191911+Caozhou1995@users.noreply.github.com>

1 parent 44f7fdb, commit 54fb18d

Showing 1 changed file with 208 additions and 0 deletions.
@@ -0,0 +1,208 @@
From 85cdcccb9c352e78308ef0fefcba5d28d3bf81d9 Mon Sep 17 00:00:00 2001
From: sfwang <wangshunfei2010@126.com>
Date: Fri, 4 Oct 2024 11:27:51 +0800
Subject: [PATCH] [metax] support llama3-70b

---
 examples/llama/conf/config.yaml               | 88 +++++++++----------
 .../conf/train/train_llama3_70b_finetune.yaml |  4 +-
 .../core/distributed/param_and_grad_buffer.py |  3 +-
 megatron/megatron/core/optimizer/__init__.py  | 10 +++
 .../core/optimizer/distrib_optimizer.py       | 10 +++
 .../megatron/legacy/fused_kernels/__init__.py |  2 +-
 6 files changed, 68 insertions(+), 49 deletions(-)

diff --git a/examples/llama/conf/config.yaml b/examples/llama/conf/config.yaml
index 592c45bf..42a487be 100644
--- a/examples/llama/conf/config.yaml
+++ b/examples/llama/conf/config.yaml
@@ -1,53 +1,49 @@
-# defaults:
-#   - train: train_llama2_7b
-#   - _self_
-
-# experiment:
-#   exp_name: llama2
-#   exp_dir: ./outputs_llama2
-#   task:
-#     type: train
-#     backend: megatron
-#     entrypoint: ./flagscale/train/train_llama.py
-#   runner:
-#     backend: torchrun
-#     nnodes: 1
-#     nproc_per_node: 8
-#     hostfile: hostfile
-#   envs:
-#     CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
-#     CUDA_DEVICE_MAX_CONNECTIONS: 1
-
-# action: run
-
-# hydra:
-#   run:
-#     dir: ${experiment.exp_dir}/hydra
-
-
+action: run
 defaults:
-  - train: train_llama3_70b
-  - _self_
-
+- train: train_llama3_70b_finetune
+- _self_
 experiment:
+  cmds:
+    before_start: ''
+  envs:
+    CUCC_PATH: /opt/maca/tools/cu-bridge
+    CUDA_DEVICE_MAX_CONNECTIONS: 1
+    CUDA_PATH: /opt/maca/tools/cu-bridge
+    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+    DEVINFO_ROOT: /opt/maca
+    FORCE_ACTIVATE_WAIT: 1
+    LD_LIBRARY_PATH: /opt/maca/lib:/opt/maca/mxgpu_llvm/lib:/opt/mxdriver/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib
+    MACA_CLANG: /opt/maca/mxgpu_llvm
+    MACA_CLANG_PATH: /opt/maca/mxgpu_llvm/bin
+    MACA_PATH: /opt/maca
+    MACA_SMALL_PAGESIZE_ENABLE: 1
+    MAX_JOBS: 20
+    MCBLAS_CUSTOMIZED_CONFIG_PATH: /share/project/mcblas_yaml/mcblas_customized_config.yaml
+    MCCL_IB_GID_INDEX: 1
+    MCCL_LIMIT_RING_LL_THREADTHRESHOLDS: 1
+    MCCL_MAX_NCHANNELS: 16
+    MCCL_NET_GDR_LEVEL: 7
+    MCCL_P2P_LEVEL: SYS
+    MCPYTORCH_DISABLE_PRINT: 1
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+    NVTE_APPLY_QK_LAYER_SCALING: 0
+    NVTE_FLASH_ATTN: 1
+    NVTE_FUSED_ATTN: 0
+    PATH: /opt/conda/bin:/opt/conda/condabin:/opt/maca/tools/cu-bridge:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/conda/bin:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+    PYTORCH_ENABLE_SAME_RAND_A100: 1
+    SET_DEVICE_NUMA_PREFERRED: 1
+  exp_dir: ./outputs_llama3
   exp_name: llama3
-  exp_dir: ./outputs_llama3_70b
+  runner:
+    backend: torchrun
+    hostfile: hostfile
+    nnodes: 4
+    nproc_per_node: 8
+    ssh_port: 1234
   task:
-    type: train
     backend: megatron
     entrypoint: ./flagscale/train/train_llama.py
-  runner:
-    backend: torchrun
-    nnodes: 4
-    nproc_per_node: 8
-    hostfile: ${hostfile??}
-  envs:
-    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
-    CUDA_DEVICE_MAX_CONNECTIONS: 1
-    NVTE_APPLY_QK_LAYER_SCALING: 0
-    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-action: run
-
-hydra:
+    type: train
+hydra:
   run:
-    dir: ${experiment.exp_dir}/hydra
+    dir: ${experiment.exp_dir}/hydra
\ No newline at end of file
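For a quick sanity check of the rewritten config, a minimal sketch (assuming a checkout of this repo and the OmegaConf API that Hydra-based FlagScale configs rely on; the printed values follow from the diff above, not from running on MetaX hardware):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/llama/conf/config.yaml")
print(cfg.experiment.runner.nnodes, cfg.experiment.runner.nproc_per_node)  # 4 8
print(cfg.experiment.envs.MACA_PATH)                                       # /opt/maca
# The hydra.run.dir interpolation resolves against experiment.exp_dir:
print(OmegaConf.to_container(cfg, resolve=True)["hydra"]["run"]["dir"])    # ./outputs_llama3/hydra
```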
diff --git a/examples/llama/conf/train/train_llama3_70b_finetune.yaml b/examples/llama/conf/train/train_llama3_70b_finetune.yaml
index 39fc4cbb..a30a401c 100644
--- a/examples/llama/conf/train/train_llama3_70b_finetune.yaml
+++ b/examples/llama/conf/train/train_llama3_70b_finetune.yaml
@@ -20,7 +20,9 @@ system:
   checkpoint:
     load: ${ckpt_path:??}
     ckpt_format: torch
-    save_interval: 100
+    no_save_optim: true
+    no_save_rng: true
+    save_interval: 500
     finetune: True

 model:
diff --git a/megatron/megatron/core/distributed/param_and_grad_buffer.py b/megatron/megatron/core/distributed/param_and_grad_buffer.py
index 65c8eeb1..37d2fc3a 100644
--- a/megatron/megatron/core/distributed/param_and_grad_buffer.py
+++ b/megatron/megatron/core/distributed/param_and_grad_buffer.py
@@ -2,6 +2,7 @@

 import logging
 import math
+import numpy
 import os
 from enum import Enum
 from typing import Dict, List, Optional
@@ -253,7 +254,7 @@ class ParamAndGradBuffer:
 # This also helps cuBLAS pick more efficient algorithms for GEMMs.
 # We now ensure that all buckets start at a memory address that is 256-byte
 # aligned (128 values since params and grads use >= 16-bit precision).
-return _pad(bucket_end_index, math.lcm(self.data_parallel_world_size, 128))
+return _pad(bucket_end_index, numpy.lcm(self.data_parallel_world_size, 128))
 return bucket_end_index

 def _pad_start_of_param_if_needed(param_start_index: int) -> int:
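Besides the new import, the only functional change in this file is the swap of `math.lcm` for `numpy.lcm` in the bucket-padding computation. A minimal sketch of why that is a drop-in replacement (the `_pad` helper below is a simplified stand-in for Megatron's, and the Python-version motivation is an assumption, not stated in the patch):

```python
import math
import numpy

def _pad(index: int, alignment: int) -> int:
    # Round `index` up to the nearest multiple of `alignment`
    # (simplified stand-in for Megatron's padding helper).
    return int(math.ceil(index / alignment)) * alignment

data_parallel_world_size = 8
bucket_end_index = 1_000_003
# math.lcm requires Python >= 3.9; numpy.lcm computes the same value and
# also works on older interpreters, which is presumably why the patch uses it.
alignment = int(numpy.lcm(data_parallel_world_size, 128))  # 128
print(_pad(bucket_end_index, alignment))                   # 1000064
```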
diff --git a/megatron/megatron/core/optimizer/__init__.py b/megatron/megatron/core/optimizer/__init__.py
index ad3777b1..01f83b6b 100644
--- a/megatron/megatron/core/optimizer/__init__.py
+++ b/megatron/megatron/core/optimizer/__init__.py
@@ -22,6 +22,16 @@ except ImportError:
     ## see https://github.com/NVIDIA/apex/blob/7b73b12361068a10b0f44844534613f252a5ea75/apex/optimizers/fused_adam.py#L16
     from torch.optim import AdamW as Adam, SGD

+try:
+    import sys
+    so_dir_path = "/share/project/qdam"
+    sys.path.append(so_dir_path)
+    from qadam import FusedAdam as Adam
+except ImportError:
+    warnings.warn(
+        f'qadam are not installed. Falling back.'
+    )
+
 from megatron.core import mpu

 from ..distributed import ParamAndGradBuffer
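The same ten-line block is added here and in distrib_optimizer.py below: it appends a site-specific directory to sys.path and prefers a MetaX fused Adam (qadam) over the torch/apex optimizer, falling back when the extension is missing. A standalone sketch of the pattern (the /share/project/qdam path and the qadam module are site-specific values taken from the patch, not a public package):

```python
import sys
import warnings

from torch.optim import AdamW as Adam  # default, matching the upstream fallback

QADAM_DIR = "/share/project/qdam"  # site-specific location of the prebuilt extension
try:
    sys.path.append(QADAM_DIR)
    from qadam import FusedAdam as Adam  # noqa: F811 -- shadows the default on success
except ImportError:
    warnings.warn("qadam is not installed; falling back to torch AdamW.")
```

If the import succeeds, every later reference to `Adam` in the module resolves to the MetaX fused kernel; otherwise training proceeds with the stock optimizer.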
diff --git a/megatron/megatron/core/optimizer/distrib_optimizer.py b/megatron/megatron/core/optimizer/distrib_optimizer.py
index b42b493f..0a0acf44 100644
--- a/megatron/megatron/core/optimizer/distrib_optimizer.py
+++ b/megatron/megatron/core/optimizer/distrib_optimizer.py
@@ -21,6 +21,16 @@ except ImportError:

     HAVE_APEX_OR_TE = False

+try:
+    import sys
+    so_dir_path = "/share/project/qdam"
+    sys.path.append(so_dir_path)
+    from qadam import FusedAdam as Adam
+except ImportError:
+    warnings.warn(
+        f'qadam are not installed. Falling back.'
+    )
+
 from .. import parallel_state, tensor_parallel
 from ..config_logger import has_config_logger_enabled, log_config_to_disk
 from ..dist_checkpointing import ShardedTensor
diff --git a/megatron/megatron/legacy/fused_kernels/__init__.py b/megatron/megatron/legacy/fused_kernels/__init__.py
index 87cceac3..5a04def1 100644
--- a/megatron/megatron/legacy/fused_kernels/__init__.py
+++ b/megatron/megatron/legacy/fused_kernels/__init__.py
@@ -56,7 +56,7 @@ def load(args):

 def _get_cuda_bare_metal_version(cuda_dir):
     raw_output = subprocess.check_output(
-        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+        [cuda_dir + "/bin/cucc", "-V"], universal_newlines=True
     )
     output = raw_output.split()
     release_idx = output.index("release") + 1
--
2.25.1
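Finally, the fused-kernels build probe calls the MACA cu-bridge compiler (`cucc`) instead of `nvcc`. A sketch of the patched probe in isolation (the example path follows from the CUDA_PATH value set in config.yaml; that `cucc -V` prints a `release X.Y` token in the same format as nvcc is assumed here, since the parsing is unchanged from upstream):

```python
import subprocess

def _get_bare_metal_version(cuda_dir: str):
    # After the patch, probe `<cuda_dir>/bin/cucc -V` instead of nvcc and
    # parse the "release X.Y" token exactly as upstream Megatron does.
    raw_output = subprocess.check_output(
        [cuda_dir + "/bin/cucc", "-V"], universal_newlines=True
    )
    output = raw_output.split()
    release = output[output.index("release") + 1].split(".")
    return raw_output, release[0], release[1][0]

# e.g. on a MACA box: _get_bare_metal_version("/opt/maca/tools/cu-bridge")
```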