feat(tests): update ci e2e tests (#45)
huangting4201 authored Feb 19, 2024
1 parent 1c3b892 commit be32910
Showing 6 changed files with 248 additions and 73 deletions.
26 changes: 20 additions & 6 deletions .github/workflows/e2e_test.yaml
@@ -10,7 +10,7 @@ env:
SLURM_PARTITION: llm_s

jobs:
training_8GPU:
training_4GPU:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -20,10 +20,10 @@ jobs:
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU
- name: training_4GPU
run: |
source $evo_env
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
training_8GPU_ISP:
runs-on: [t_cluster]
@@ -37,7 +37,21 @@ jobs:

- name: training_8GPU_ISP
run: |
source $evo_env
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
training_8GPU_ISP_CKPT:
runs-on: [t_cluster]
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP_CKPT
run: |
source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
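
Each job above selects its tests purely by pytest marker (-m "training_4GPU", -m "training_8GPU_ISP_SAVE_CKPT", ...) against ./tests/test_training/test_loss.py, and sizes the Slurm allocation to match (-n4 / -n8 with one GPU per task). Below is a minimal sketch of how such a marker is typically attached to a test; the helper, test name, and numbers are placeholders, not the repository's code.

    # Illustrative only: how a marker like "training_4GPU" is usually wired up so
    # that `pytest -m "training_4GPU"` selects the test. The helper and threshold
    # below are placeholders, not InternEvo code.
    import pytest


    def run_short_training(num_gpus: int) -> float:
        """Stand-in for a short distributed training run returning a final loss."""
        return 3.95  # placeholder value


    @pytest.mark.training_4GPU
    def test_loss_on_4_gpus():
        final_loss = run_short_training(num_gpus=4)
        # hypothetical tolerance check against a stored baseline loss
        assert abs(final_loss - 4.0) < 0.2

In a real setup the marker would also be registered (for example in pytest.ini under "markers =") so the -m filter does not warn about an unknown mark.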
136 changes: 114 additions & 22 deletions .github/workflows/weekly_test.yaml
@@ -7,7 +7,7 @@ env:
SLURM_PARTITION: llm_s

jobs:
training_8GPU:
training_4GPU:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -19,12 +19,12 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU
- name: training_4GPU
run: |
source $evo_env
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
training_16GPU_8DP2TP:
training_8GPU_4DP2TP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -36,13 +36,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2TP
- name: training_8GPU_4DP2TP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
training_16GPU_8DP2TPSP:
training_8GPU_4DP2TPSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -54,14 +54,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2TPSP
- name: training_8GPU_4DP2TPSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP:
training_8GPU_4DP2PP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -73,13 +72,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2PP
- name: training_8GPU_4DP2PP
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP_InterleavedOverlap:
training_8GPU_4DP2PP_InterleavedOverlap:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -91,12 +90,100 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2PP_InterleavedOverlap
- name: training_8GPU_4DP2PP_InterleavedOverlap
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
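
Most of the weekly jobs patch configs/7B_sft.py in place with sed before launching, so the layout under test is exactly what the edited config ends up saying. For readers less used to sed, here is a rough Python equivalent of the two edits in the interleaved-overlap job above; it is illustrative only (the workflow keeps using sed, and the replacement indentation is approximate).

    # Rough Python equivalent of the sed edits used above (illustrative only).
    import re
    from pathlib import Path

    cfg_path = Path("./configs/7B_sft.py")
    text = cfg_path.read_text()
    # mirrors: sed -i 's/^.*pipeline=.*/    pipeline=dict(size=2, interleaved_overlap=True),/'
    text = re.sub(r"^.*pipeline=.*$",
                  "    pipeline=dict(size=2, interleaved_overlap=True),",
                  text, flags=re.MULTILINE)
    # mirrors: sed -i 's/^.*num_chunks=.*/    num_chunks=2,/'
    text = re.sub(r"^.*num_chunks=.*$", "    num_chunks=2,", text, flags=re.MULTILINE)
    cfg_path.write_text(text)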
training_16GPU_4DP2TP2PP_MTP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MTP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="mtp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_MSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="msp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_FSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_FSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
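
The job names encode the layout, and the srun -n value is simply the product of the parallel sizes with one GPU per task: 4DP2TP and 4DP2PP need 8 ranks, 4DP2TP2PP needs 16. A quick arithmetic check, assuming the dp/tp/pp reading of the names used above:

    # Sanity check of the GPU counts implied by the job names (dp * tp * pp),
    # assuming the dp/tp/pp naming convention of the jobs above.
    def world_size(dp: int, tp: int = 1, pp: int = 1) -> int:
        return dp * tp * pp

    assert world_size(dp=4) == 4                 # training_4GPU
    assert world_size(dp=4, tp=2) == 8           # training_8GPU_4DP2TP / 4DP2TPSP
    assert world_size(dp=4, pp=2) == 8           # training_8GPU_4DP2PP variants
    assert world_size(dp=4, tp=2, pp=2) == 16    # training_16GPU_4DP2TP2PP_{MTP,MSP,FSP}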
training_8GPU_ISP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP
run: |
source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
training_8GPU_ISP_CKPT:
runs-on: [t_cluster]
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP_CKPT
run: |
source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
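
training_8GPU_ISP_CKPT runs two markers back to back: the first run saves a checkpoint, the second loads it and re-checks the loss, which is also why this job gets a 20-minute budget instead of 10. A heavily simplified, self-contained illustration of that save-then-resume pattern follows; the toy state, file path, and function names are made up and are not the repository's test logic.

    # Toy illustration of the SAVE_CKPT -> LOAD_CKPT pairing (hypothetical names/paths).
    import json
    from pathlib import Path

    CKPT = Path("/tmp/isp_ckpt.json")  # stand-in for the real checkpoint folder

    def train(steps, state=None):
        state = dict(state or {"step": 0, "loss": 10.0})
        for _ in range(steps):
            state["step"] += 1
            state["loss"] *= 0.9  # toy "training"
        return state

    def test_save_ckpt():
        CKPT.write_text(json.dumps(train(steps=5)))

    def test_load_ckpt():
        resumed = train(steps=5, state=json.loads(CKPT.read_text()))
        fresh = train(steps=10)
        # resuming from the checkpoint should match an uninterrupted run
        assert abs(resumed["loss"] - fresh["loss"]) < 1e-9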
unit_test_optimizer:
runs-on: [t_cluster]
@@ -162,11 +249,16 @@ jobs:
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [
training_8GPU,
training_16GPU_8DP2TP,
training_16GPU_8DP2TPSP,
training_16GPU_8DP2PP,
training_16GPU_8DP2PP_InterleavedOverlap,
training_4GPU,
training_8GPU_4DP2TP,
training_8GPU_4DP2TPSP,
training_8GPU_4DP2PP,
training_8GPU_4DP2PP_InterleavedOverlap,
training_16GPU_4DP2TP2PP_MTP,
training_16GPU_4DP2TP2PP_MSP,
training_16GPU_4DP2TP2PP_FSP,
training_8GPU_ISP,
training_8GPU_ISP_CKPT,
unit_test_optimizer,
unit_test_model,
load_ckpt_then_assert_loss
2 changes: 1 addition & 1 deletion configs/7B_sft.py
@@ -173,7 +173,7 @@
3. memory_pool: bool, enable/disable memory pool, defaults to False.
"""
parallel = dict(
zero1=dict(size=8),
zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True, memory_pool=True),
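
The only config change is zero1=dict(size=8) becoming zero1=dict(size=-1). In InternEvo's convention a non-positive zero1 size is taken to mean "use the whole data-parallel group as the ZeRO-1 group", so the value no longer has to be kept in sync with whatever GPU count a given CI job uses. A small sketch of that resolution rule as I read it (illustrative, not code from the repo):

    # Sketch of how a zero1 size of -1 is typically resolved (my reading; illustrative).
    def resolve_zero1_size(zero1_size: int, dp_world_size: int) -> int:
        """<= 0 means 'span the whole data-parallel group'; otherwise it must divide it."""
        if zero1_size <= 0:
            return dp_world_size
        assert dp_world_size % zero1_size == 0, "zero1 size must divide the dp group size"
        return zero1_size

    assert resolve_zero1_size(-1, dp_world_size=4) == 4   # the new default works on 4 GPUs
    assert resolve_zero1_size(8, dp_world_size=8) == 8    # the old value assumed 8 dp ranks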
3 changes: 2 additions & 1 deletion internlm/utils/model_checkpoint.py
@@ -322,6 +322,8 @@ def save_model_checkpoint(folder, model):
pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
wdp_rank = gpc.get_local_rank(ParallelMode.WEIGHT_DATA)

should_save_rank_pair = set() # (tp_rank, dp_rank)

# TODO In theory, we should also consider pp level, but since pp is generally a state across machines,
# even if pp is not considered, it will definitely not be written on the same machine.

@@ -336,7 +338,6 @@ def save_model_checkpoint(folder, model):
llm_save(topo_fp, saved_obj=topo)
else:
# for tensor parallel mode with mtp/msp/fsp
should_save_rank_pair = set() # (tp_rank, dp_rank)
for i in range(tp_size):
if gpc.config.parallel.zero1.fsdp:
for j in range(dp_size):
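
The checkpoint change only moves should_save_rank_pair = set() out of the mtp/msp/fsp-only branch so the set exists on every code path, presumably so the ISP checkpointing exercised by the new CI jobs can reuse it. For context, a stripped-down version of the rank-pair dedup idea; this is illustrative, and the real selection in save_model_checkpoint may pick different representative ranks.

    # Stripped-down illustration of choosing which (tp_rank, dp_rank) pairs write
    # model shards; not the exact logic in save_model_checkpoint.
    def ranks_that_save(tp_size: int, dp_size: int, fsdp: bool):
        should_save_rank_pair = set()  # (tp_rank, dp_rank)
        for i in range(tp_size):
            if fsdp:
                # with FSDP every dp rank holds a distinct shard, so all pairs must write
                for j in range(dp_size):
                    should_save_rank_pair.add((i, j))
            else:
                # otherwise dp ranks hold replicas, so one representative per tp rank
                # is enough (spread across dp ranks here; the real choice may differ)
                should_save_rank_pair.add((i, i % dp_size))
        return should_save_rank_pair

    assert ranks_that_save(tp_size=2, dp_size=4, fsdp=False) == {(0, 0), (1, 1)}
    assert len(ranks_that_save(tp_size=2, dp_size=4, fsdp=True)) == 8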
3 changes: 1 addition & 2 deletions tests/test_training/test_forward_output_no_fa.py
@@ -11,10 +11,9 @@
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.context.parallel_context import Config
from internlm.core.scheduler import SchedulerMetricHook
from internlm.initialize.launch import args_sanity_check
from internlm.model.loss import FlashGPTLMLoss
from internlm.model.metrics import AccPerplex
from internlm.model.metrics import AccPerplex, SchedulerMetricHook
from internlm.train import get_train_data_loader, initialize_model, initialize_optimizer
from internlm.utils.logger import get_logger

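
This test now imports SchedulerMetricHook from internlm.model.metrics instead of internlm.core.scheduler, suggesting the hook now lives in (or is re-exported from) the metrics module. If code outside the repository has to run against both layouts, a guarded import is the usual workaround; this is only an illustration, the test itself simply uses the new path.

    # Illustrative compatibility shim for code that must work across both import paths.
    try:
        from internlm.model.metrics import SchedulerMetricHook  # location used by this PR
    except ImportError:  # older revisions kept the hook in the scheduler module
        from internlm.core.scheduler import SchedulerMetricHook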