feat(tests): update ci e2e tests
huangting4201 committed Feb 5, 2024
1 parent 99ee863 commit a73e51b
Showing 4 changed files with 105 additions and 22 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/e2e_test.yaml
@@ -41,3 +41,21 @@ jobs:
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
training_8GPU_ISP_CKPT:
runs-on: [t_cluster]
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP_CKPT
run: |
source $evo_env
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
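
The new training_8GPU_ISP_CKPT job (like the existing training_8GPU_ISP job above it) selects its tests with pytest's -m marker filter, so the markers training_8GPU_ISP, training_8GPU_ISP_SAVE_CKPT and training_8GPU_ISP_LOAD_CKPT all need to be registered with pytest to run without unknown-mark warnings. The repository presumably does this in its own pytest configuration; purely as an illustration, a minimal conftest.py registering them could look like this (the marker descriptions are assumptions):

# conftest.py -- illustrative sketch only; the marker descriptions are assumptions.
def pytest_configure(config):
    # Register the custom markers used by the CI jobs so "pytest -m <marker>"
    # selects tests cleanly instead of warning about unknown marks.
    for marker in (
        "training_8GPU_ISP: 8-GPU ISP training loss test",
        "training_8GPU_ISP_SAVE_CKPT: 8-GPU ISP training run that saves checkpoints",
        "training_8GPU_ISP_LOAD_CKPT: 8-GPU ISP training run that resumes from saved checkpoints",
    ):
        config.addinivalue_line("markers", marker)

Note that the SAVE_CKPT and LOAD_CKPT srun invocations run back to back in the same step, so the second one resumes from the checkpoints written by the first.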
48 changes: 41 additions & 7 deletions .github/workflows/weekly_test.yaml
@@ -22,7 +22,7 @@ jobs:
- name: training_8GPU
run: |
source $evo_env
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py
training_16GPU_8DP2TP:
runs-on: [t_cluster]
@@ -40,7 +40,7 @@ jobs:
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training/test_loss.py
training_16GPU_8DP2TPSP:
runs-on: [t_cluster]
@@ -57,9 +57,8 @@ jobs:
- name: training_16GPU_8DP2TPSP
run: |
source $evo_env
- sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
- sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' ./configs/7B_sft.py
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
+ sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP:
runs-on: [t_cluster]
@@ -77,7 +76,7 @@ jobs:
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP_InterleavedOverlap:
runs-on: [t_cluster]
@@ -96,7 +95,42 @@ jobs:
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
training_8GPU_ISP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP
run: |
source $evo_env
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
training_8GPU_ISP_CKPT:
runs-on: [t_cluster]
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP_CKPT
run: |
source $evo_env
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
unit_test_optimizer:
runs-on: [t_cluster]
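
Besides pointing every weekly job at ./tests/test_training/test_loss.py instead of the whole ./tests/test_training directory, the notable functional change in this file is the 8DP2TPSP job: rather than flipping a separate sequence_parallel flag, it now writes the sequence-parallel choice directly into the tensor setting. As a rough sketch, the patched configs/7B_sft.py ends up containing something like the following (only the tensor line comes verbatim from the sed command; the surrounding layout and the other values are assumptions):

# Illustrative excerpt of configs/7B_sft.py after the 8DP2TPSP sed rewrite; only the
# tensor line is exactly what the sed command writes, the other entries are assumed defaults.
parallel = dict(
    zero1=dict(size=-1),                              # assumed default ZeRO setting
    tensor=dict(size=2, mode="fsp"),                  # 2-way tensor parallel in "fsp" sequence-parallel mode
    pipeline=dict(size=1, interleaved_overlap=True),  # assumed default pipeline setting
)

The mode value lines up with the mtp/msp/fsp naming that the checkpoint code below distinguishes from ISP.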
3 changes: 2 additions & 1 deletion internlm/utils/model_checkpoint.py
@@ -322,6 +322,8 @@ def save_model_checkpoint(folder, model):
pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
wdp_rank = gpc.get_local_rank(ParallelMode.WEIGHT_DATA)

+ should_save_rank_pair = set()  # (tp_rank, dp_rank)

# TODO In theory, we should also consider pp level, but since pp is generally a state across machines,
# even if pp is not considered, it will definitely not be written on the same machine.

@@ -336,7 +338,6 @@ def save_model_checkpoint(folder, model):
llm_save(topo_fp, saved_obj=topo)
else:
# for tensor parallel mode with mtp/msp/fsp
- should_save_rank_pair = set()  # (tp_rank, dp_rank)
for i in range(tp_size):
if gpc.config.parallel.zero1.fsdp:
for j in range(dp_size):
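
The change here only hoists should_save_rank_pair out of the mtp/msp/fsp branch so the set exists regardless of which tensor-parallel mode is active. Below is a self-contained sketch of the deduplication idea the set supports: each (tp_rank, dp_rank) pair is added at most once, and only ranks whose pair is in the set write a shard, so data-parallel replicas do not save the same weights twice. This is a simplification with assumed policy details, not the repository's save_model_checkpoint implementation.

# Minimal sketch of the (tp_rank, dp_rank) ownership set -- a simplification with
# assumed policy details, not the actual save_model_checkpoint implementation.
def owns_shard(tp_rank: int, dp_rank: int, tp_size: int, dp_size: int, fsdp: bool) -> bool:
    should_save_rank_pair = set()  # (tp_rank, dp_rank)
    for i in range(tp_size):
        if fsdp:
            # with FSDP every data-parallel rank holds a distinct shard, so every pair writes
            for j in range(dp_size):
                should_save_rank_pair.add((i, j))
        else:
            # otherwise a single data-parallel replica per tensor rank is enough (assumed policy)
            should_save_rank_pair.add((i, i % dp_size))
    return (tp_rank, dp_rank) in should_save_rank_pair

# Example: with tp_size=2, dp_size=4 and no FSDP, only (0, 0) and (1, 1) write their shards.
assert owns_shard(0, 0, tp_size=2, dp_size=4, fsdp=False)
assert not owns_shard(0, 1, tp_size=2, dp_size=4, fsdp=False)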
58 changes: 44 additions & 14 deletions tests/test_training/test_loss.py
@@ -1,6 +1,7 @@
import math
import os
import subprocess
import shutil

import pytest
import torch
@@ -12,7 +13,7 @@
from internlm.core.trainer import TrainState
from internlm.initialize import initialize_distributed_env
from internlm.model.loss import FlashGPTLMLoss
- from internlm.model.metrics import AccPerplex, SchedulerMetricHook
+ from internlm.model.metrics import AccPerplex
from internlm.train import (
get_train_data_loader,
initialize_model,
@@ -53,6 +54,7 @@ def train(
num_chunks: int = 2,
interleaved: bool = False,
enable_sp: bool = False,
enable_ckpt: bool = False,
):
# initialize distributed environment
initialize_distributed_env(config=CONFIG_FILE_PATH)
@@ -92,6 +94,14 @@ def train(
skip_batches = gpc.config.data.skip_batches
label_smoothing = gpc.config.loss.label_smoothing

# update ckpt config
if enable_ckpt:
gpc.config.ckpt.enable_save_ckpt = True
gpc.config.ckpt.checkpoint_every = 5
gpc.config.ckpt.save_ckpt_folder = "local:llm_ckpts/"
gpc.config.ckpt.load_ckpt_info["content"] = ("all",)
gpc.config.ckpt.oss_snapshot_freq = 100

# get and broadcast current time
current_time = launch_time()
objs = [current_time]
@@ -139,19 +149,6 @@ def train(
dataset_types=dataset_types,
)

- # initialize trainer
- scheduler_hooks = [
- SchedulerMetricHook(
- metric=metric,
- skip=(
- gpc.is_using_parallel_mode(ParallelMode.PIPELINE)
- and hasattr(gpc.config.model, "num_chunks")
- and gpc.config.model.num_chunks > 1
- and gpc.config.parallel["pipeline"].get("interleaved_overlap", False)
- ),
- ),
- ]

# initialize trainer
trainer, train_dl, _, _ = internlm.initialize_trainer(
model=model,
@@ -213,6 +210,9 @@ def train(
cur_loss_list.append((loss.item() - moe_loss.item() if moe_loss is not None else loss.item()))
timer("fwd-bwd").stop()

if isp_communicator and isp_communicator.enable_memory_pool:
isp_communicator.memory_pool.reset_lazy_pools()

# update parameters, and returns (success_update, grad_norm)
trainer_result = trainer.step()
assert trainer_result is not None
@@ -226,6 +226,14 @@ def train(

timer("one-batch").stop()

# checkpoint the training states in specific steps, which is determined by the args "checkpoint_every"
# # save batch sampler that tracks the true consumed samples
now_break = ckpt_manager.try_save_checkpoint(train_state)
if now_break:
break

ckpt_manager.wait_async_upload_finish()


def check_loss_spike():
if gpc.is_rank_for_log():
@@ -311,3 +319,25 @@ def test_training_with_isp():

# model training
train(dp_size=4, tp_size=2, wp_size=4, enable_sp=True)


@pytest.mark.training_8GPU_ISP_SAVE_CKPT
def test_training_with_isp_save_ckpt():
# update config file
global CONFIG_FILE_PATH
CONFIG_FILE_PATH = "./configs/7B_isp_sft.py"

# model training save ckpt
train(dp_size=4, tp_size=2, wp_size=4, enable_sp=True, enable_ckpt=True)


@pytest.mark.training_8GPU_ISP_LOAD_CKPT
def test_training_with_isp_load_ckpt():
# update config file
global CONFIG_FILE_PATH
CONFIG_FILE_PATH = "./configs/7B_isp_sft.py"

shutil.rmtree("./llm_ckpts/10")

# model training load ckpt
train(dp_size=4, tp_size=2, wp_size=4, enable_sp=True, enable_ckpt=True)
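
The two new tests share the train(..., enable_ckpt=True) path: the SAVE_CKPT run writes a checkpoint every 5 steps into local:llm_ckpts/, while the LOAD_CKPT run first deletes the step-10 snapshot and then trains again, so it has to resume from an earlier checkpoint with content ("all",). For orientation, the checkpoint fields the test overrides would sit in a config such as configs/7B_isp_sft.py roughly like this (only the keys assigned in train() are taken from the test; the remaining keys and their values are assumptions):

# Illustrative ckpt section -- a sketch, not the actual configs/7B_isp_sft.py contents.
ckpt = dict(
    enable_save_ckpt=True,                # set by the test: turn checkpoint saving on
    checkpoint_every=5,                   # set by the test: save every 5 training steps
    save_ckpt_folder="local:llm_ckpts/",  # set by the test: local storage backend and folder
    load_ckpt_info=dict(
        path="local:llm_ckpts/",          # assumption: where the resumed run loads from
        content=("all",),                 # set by the test: restore model, optimizer and sampler state
        ckpt_type="internlm",             # assumption
    ),
    oss_snapshot_freq=100,                # set by the test: push snapshot uploads beyond this short run
)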
