[metax] Add llama3-70B patch (FlagOpen#227)
Add llama3-70B patch

---------

Co-authored-by: caozhou <48191911+Caozhou1995@users.noreply.github.com>
suenphey and Caozhou1995 authored Oct 5, 2024
1 parent 44f7fdb commit 54fb18d
Showing 1 changed file with 208 additions and 0 deletions: hardware/metax_C500/a44556c0/a44556c0.patch
@@ -0,0 +1,208 @@
From 85cdcccb9c352e78308ef0fefcba5d28d3bf81d9 Mon Sep 17 00:00:00 2001
From: sfwang <wangshunfei2010@126.com>
Date: Fri, 4 Oct 2024 11:27:51 +0800
Subject: [PATCH] [metax] support llama3-70b

---
examples/llama/conf/config.yaml | 88 +++++++++----------
.../conf/train/train_llama3_70b_finetune.yaml | 4 +-
.../core/distributed/param_and_grad_buffer.py | 3 +-
megatron/megatron/core/optimizer/__init__.py | 10 +++
.../core/optimizer/distrib_optimizer.py | 10 +++
.../megatron/legacy/fused_kernels/__init__.py | 2 +-
6 files changed, 68 insertions(+), 49 deletions(-)

diff --git a/examples/llama/conf/config.yaml b/examples/llama/conf/config.yaml
index 592c45bf..42a487be 100644
--- a/examples/llama/conf/config.yaml
+++ b/examples/llama/conf/config.yaml
@@ -1,53 +1,49 @@
-# defaults:
-# - train: train_llama2_7b
-# - _self_
-
-# experiment:
-# exp_name: llama2
-# exp_dir: ./outputs_llama2
-# task:
-# type: train
-# backend: megatron
-# entrypoint: ./flagscale/train/train_llama.py
-# runner:
-# backend: torchrun
-# nnodes: 1
-# nproc_per_node: 8
-# hostfile: hostfile
-# envs:
-# CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
-# CUDA_DEVICE_MAX_CONNECTIONS: 1
-
-# action: run
-
-# hydra:
-# run:
-# dir: ${experiment.exp_dir}/hydra
-
-
+action: run
defaults:
- - train: train_llama3_70b
- - _self_
-
+- train: train_llama3_70b_finetune
+- _self_
experiment:
+ cmds:
+ before_start: ''
+ envs:
+ CUCC_PATH: /opt/maca/tools/cu-bridge
+ CUDA_DEVICE_MAX_CONNECTIONS: 1
+ CUDA_PATH: /opt/maca/tools/cu-bridge
+ CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+ DEVINFO_ROOT: /opt/maca
+ FORCE_ACTIVATE_WAIT: 1
+ LD_LIBRARY_PATH: /opt/maca/lib:/opt/maca/mxgpu_llvm/lib:/opt/mxdriver/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib
+ MACA_CLANG: /opt/maca/mxgpu_llvm
+ MACA_CLANG_PATH: /opt/maca/mxgpu_llvm/bin
+ MACA_PATH: /opt/maca
+ MACA_SMALL_PAGESIZE_ENABLE: 1
+ MAX_JOBS: 20
+ MCBLAS_CUSTOMIZED_CONFIG_PATH: /share/project/mcblas_yaml/mcblas_customized_config.yaml
+ MCCL_IB_GID_INDEX: 1
+ MCCL_LIMIT_RING_LL_THREADTHRESHOLDS: 1
+ MCCL_MAX_NCHANNELS: 16
+ MCCL_NET_GDR_LEVEL: 7
+ MCCL_P2P_LEVEL: SYS
+ MCPYTORCH_DISABLE_PRINT: 1
+ NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+ NVTE_APPLY_QK_LAYER_SCALING: 0
+ NVTE_FLASH_ATTN: 1
+ NVTE_FUSED_ATTN: 0
+ PATH: /opt/conda/bin:/opt/conda/condabin:/opt/maca/tools/cu-bridge:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/conda/bin:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ PYTORCH_ENABLE_SAME_RAND_A100: 1
+ SET_DEVICE_NUMA_PREFERRED: 1
+ exp_dir: ./outputs_llama3
exp_name: llama3
- exp_dir: ./outputs_llama3_70b
+ runner:
+ backend: torchrun
+ hostfile: hostfile
+ nnodes: 4
+ nproc_per_node: 8
+ ssh_port: 1234
task:
- type: train
backend: megatron
entrypoint: ./flagscale/train/train_llama.py
- runner:
- backend: torchrun
- nnodes: 4
- nproc_per_node: 8
- hostfile: ${hostfile??}
- envs:
- CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
- CUDA_DEVICE_MAX_CONNECTIONS: 1
- NVTE_APPLY_QK_LAYER_SCALING: 0
- NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
-action: run
-
-hydra:
+ type: train
+hydra:
run:
- dir: ${experiment.exp_dir}/hydra
+ dir: ${experiment.exp_dir}/hydra
\ No newline at end of file
diff --git a/examples/llama/conf/train/train_llama3_70b_finetune.yaml b/examples/llama/conf/train/train_llama3_70b_finetune.yaml
index 39fc4cbb..a30a401c 100644
--- a/examples/llama/conf/train/train_llama3_70b_finetune.yaml
+++ b/examples/llama/conf/train/train_llama3_70b_finetune.yaml
@@ -20,7 +20,9 @@ system:
checkpoint:
load: ${ckpt_path:??}
ckpt_format: torch
- save_interval: 100
+ no_save_optim: true
+ no_save_rng: true
+ save_interval: 500
finetune: True

model:
diff --git a/megatron/megatron/core/distributed/param_and_grad_buffer.py b/megatron/megatron/core/distributed/param_and_grad_buffer.py
index 65c8eeb1..37d2fc3a 100644
--- a/megatron/megatron/core/distributed/param_and_grad_buffer.py
+++ b/megatron/megatron/core/distributed/param_and_grad_buffer.py
@@ -2,6 +2,7 @@

import logging
import math
+import numpy
import os
from enum import Enum
from typing import Dict, List, Optional
@@ -253,7 +254,7 @@ class ParamAndGradBuffer:
# This also helps cuBLAS pick more efficient algorithms for GEMMs.
# We now ensure that all buckets start at a memory address that is 256-byte
# aligned (128 values since params and grads use >= 16-bit precision).
- return _pad(bucket_end_index, math.lcm(self.data_parallel_world_size, 128))
+ return _pad(bucket_end_index, numpy.lcm(self.data_parallel_world_size, 128))
return bucket_end_index

def _pad_start_of_param_if_needed(param_start_index: int) -> int:
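
The hunk above replaces math.lcm with numpy.lcm in the bucket-end padding. A minimal standalone sketch of that padding step, assuming a _pad helper that rounds up to the nearest multiple (an approximation of the helper defined elsewhere in param_and_grad_buffer.py):

import math
import numpy

def _pad(value: int, granularity: int) -> int:
    # Round value up to the nearest multiple of granularity (sketch of the buffer's own helper).
    return int(math.ceil(value / granularity) * granularity)

data_parallel_world_size = 32  # example value
bucket_end_index = 100_000     # example value
# numpy.lcm is a drop-in for math.lcm here; the result still keeps buckets 256-byte aligned.
padded_end = _pad(bucket_end_index, int(numpy.lcm(data_parallel_world_size, 128)))
print(padded_end)  # 100096: the next multiple of lcm(32, 128) = 128
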
diff --git a/megatron/megatron/core/optimizer/__init__.py b/megatron/megatron/core/optimizer/__init__.py
index ad3777b1..01f83b6b 100644
--- a/megatron/megatron/core/optimizer/__init__.py
+++ b/megatron/megatron/core/optimizer/__init__.py
@@ -22,6 +22,16 @@ except ImportError:
## see https://github.com/NVIDIA/apex/blob/7b73b12361068a10b0f44844534613f252a5ea75/apex/optimizers/fused_adam.py#L16
from torch.optim import AdamW as Adam, SGD

+try:
+ import sys
+ so_dir_path = "/share/project/qdam"
+ sys.path.append(so_dir_path)
+ from qadam import FusedAdam as Adam
+except ImportError:
+ warnings.warn(
+ 'qadam is not installed. Falling back.'
+ )
+
from megatron.core import mpu

from ..distributed import ParamAndGradBuffer
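
This hunk (and the identical one in distrib_optimizer.py below) prefers MetaX's prebuilt qadam FusedAdam when it can be imported and otherwise keeps the Adam imported above. A cleaned-up, self-contained sketch of the same optional-import pattern, reusing the path and module name from the patch:

import sys
import warnings

QADAM_DIR = "/share/project/qdam"  # location of the prebuilt qadam extension, as hard-coded in the patch

try:
    sys.path.append(QADAM_DIR)
    from qadam import FusedAdam as Adam  # MetaX fused Adam kernel
except ImportError:
    # Keep whichever Adam was imported earlier (apex FusedAdam or torch AdamW).
    warnings.warn("qadam is not installed. Falling back to the default Adam implementation.")

Because the directory is hard-coded for the target cluster, hosts without it simply take the fallback path.
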
diff --git a/megatron/megatron/core/optimizer/distrib_optimizer.py b/megatron/megatron/core/optimizer/distrib_optimizer.py
index b42b493f..0a0acf44 100644
--- a/megatron/megatron/core/optimizer/distrib_optimizer.py
+++ b/megatron/megatron/core/optimizer/distrib_optimizer.py
@@ -21,6 +21,16 @@ except ImportError:

HAVE_APEX_OR_TE = False

+try:
+ import sys
+ so_dir_path = "/share/project/qdam"
+ sys.path.append(so_dir_path)
+ from qadam import FusedAdam as Adam
+except ImportError:
+ warnings.warn(
+ 'qadam is not installed. Falling back.'
+ )
+
from .. import parallel_state, tensor_parallel
from ..config_logger import has_config_logger_enabled, log_config_to_disk
from ..dist_checkpointing import ShardedTensor
diff --git a/megatron/megatron/legacy/fused_kernels/__init__.py b/megatron/megatron/legacy/fused_kernels/__init__.py
index 87cceac3..5a04def1 100644
--- a/megatron/megatron/legacy/fused_kernels/__init__.py
+++ b/megatron/megatron/legacy/fused_kernels/__init__.py
@@ -56,7 +56,7 @@ def load(args):

def _get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output(
- [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+ [cuda_dir + "/bin/cucc", "-V"], universal_newlines=True
)
output = raw_output.split()
release_idx = output.index("release") + 1
--
2.25.1
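
On MetaX the CUDA toolchain is provided through cu-bridge (see the CUCC_PATH and CUDA_PATH entries in config.yaml above), so the bare-metal version probe calls cucc instead of nvcc. A rough standalone sketch of the patched helper, assuming cucc -V reports its version in the same "release X.Y" format that nvcc uses:

import subprocess

def get_cuda_bare_metal_version(cuda_dir: str):
    # Query the cu-bridge compiler for its version banner.
    raw_output = subprocess.check_output([cuda_dir + "/bin/cucc", "-V"], universal_newlines=True)
    output = raw_output.split()
    # Parse "... release X.Y ..." exactly as the original nvcc-based code does.
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    return raw_output, release[0], release[1][0]
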
