Skip to content

Commit

Permalink
fix check CUDA_DEVICE_MAX_CONNECTIONS
Browse files Browse the repository at this point in the history
  • Loading branch information
sallyjunjun committed Dec 18, 2024
1 parent 141e9eb commit f763177
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 8 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/demo_in_readme.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,25 +45,33 @@ jobs:
id: basic_train
run: |
source activate ${evo_env_torch21_flash2}
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_preset_ckpt
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
run: |
source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_new_ckpt
run: |
source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
- name: torchrun-train
run: |
source activate ${evo_env_torch21_flash2}
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
Expand Down
29 changes: 28 additions & 1 deletion internlm/core/trainer_builder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import gc
import logging
import os
import time
from functools import partial
from typing import Dict, List, Optional, Union
Expand All @@ -8,6 +9,7 @@
import torch.distributed as dist
from torch.utils.data import DataLoader

from internlm.accelerator import AcceleratorType, get_accelerator
from internlm.checkpoint.checkpoint_manager import CheckpointManager
from internlm.core.context import global_context as gpc
from internlm.core.context.process_group_initializer import ParallelMode
Expand All @@ -31,7 +33,6 @@
)
from internlm.utils.common import (
BatchSkipper,
check_cuda_env,
enable_pytorch_expandable_segments,
get_current_device,
get_megatron_flops,
Expand All @@ -47,6 +48,32 @@

# global llm logger
logger = logging.getLogger(__file__)
internlm_accelerator = get_accelerator()


def _require_env_var_is_one(name):
    """Raise ``AssertionError`` unless environment variable ``name`` is "1".

    Args:
        name: Name of the environment variable to validate.

    Raises:
        AssertionError: If the variable is unset, or set to anything other
            than the string "1".
    """
    value = os.getenv(name)
    # Raise explicitly instead of using ``assert`` so the validation still
    # runs when Python is started with -O (which strips assert statements).
    if value is None:
        raise AssertionError(f"Env var {name} has not been set, please set it to 1!")
    if value != "1":
        raise AssertionError(f"Env var {name} is set to {value}, it should be set to 1!")


def check_cuda_env():
    """Validate the environment variables expected on the GPU backend.

    Does nothing unless the accelerator backend is ``AcceleratorType.GPU``.
    On GPU:

    * ``CUDA_DEVICE_MAX_CONNECTIONS`` must be "1" when, for both the weight
      and the expert-weight parallel configs, either the parallel size is 1
      or ``forward_overlap_per`` is not "layer".
    * ``TORCH_NCCL_AVOID_RECORD_STREAMS`` must be "1".

    Raises:
        AssertionError: If a required variable is unset or not equal to "1".
    """
    if internlm_accelerator.get_accelerator_backend() != AcceleratorType.GPU:
        return

    weight_cfg = gpc.config.parallel.weight
    expert_weight_cfg = gpc.config.parallel.expert_weight

    wp_fwd_per = weight_cfg.get("forward_overlap_per", "layer")
    ewp_fwd_per = expert_weight_cfg.get("forward_overlap_per", "layer")
    wp_size = weight_cfg.get("size", 1)
    ewp_size = expert_weight_cfg.get("size", 1)

    # The single-connection requirement is only enforced when neither weight
    # parallel nor expert weight parallel runs with size > 1 and per-"layer"
    # forward overlap.
    open_max_conns = (wp_size == 1 or wp_fwd_per != "layer") and (ewp_size == 1 or ewp_fwd_per != "layer")
    if open_max_conns:
        _require_env_var_is_one("CUDA_DEVICE_MAX_CONNECTIONS")

    # NOTE(review): this check is applied for every GPU run, independent of
    # ``open_max_conns`` -- confirm this matches the intended nesting.
    _require_env_var_is_one("TORCH_NCCL_AVOID_RECORD_STREAMS")


class TrainerBuilder(Trainer):
Expand Down
4 changes: 2 additions & 2 deletions internlm/data/tokenized/dummy_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from torch.utils.data import Dataset

# from internlm.core.context.parallel_context import global_context as gpc
from internlm.core.context.parallel_context import global_context as gpc


class RandomDataset(Dataset):
Expand All @@ -30,7 +30,7 @@ def __init__(self, num_samples=10000, max_len=1024, fixed_seqlen: bool = False)
while len(d) < max_len:
r *= 2
d = list(range(n)) * r
# r = r % gpc.config.model.vocab_size
r = r % gpc.config.model.vocab_size
d = [n, r] + d
d = d[:max_len]
data.append(d)
Expand Down
5 changes: 0 additions & 5 deletions internlm/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,6 @@ def enable_pytorch_expandable_segments():
logger.warning("To support the 'expandable_segments' configuration, please upgrade torch to version 2.1.0.")


def check_cuda_env():
    """Log a warning when CUDA_DEVICE_MAX_CONNECTIONS is not set in the environment."""
    if "CUDA_DEVICE_MAX_CONNECTIONS" in os.environ:
        # Variable is present (whatever its value); nothing to report.
        return
    logger.warning("Env var CUDA_DEVICE_MAX_CONNECTIONS has not be set, please note this!")


class DummyProfile:
"""
Dummy Profile.
Expand Down
1 change: 1 addition & 0 deletions tests/test_data/test_batch_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def do_warmup(args):
rank, worldsize, init_config, should_sccuess, answer = args
build_environment(rank, worldsize, init_config)
gpc.config.model.num_chunks = 1 if gpc.get_world_size(ParallelMode.PIPELINE) == 1 else 2
gpc.config.model.vocab_size = 92544
engine, scheduler = init_model_and_optim(
8,
gpc.config.model.num_chunks,
Expand Down

0 comments on commit f763177

Please sign in to comment.