feat(tests): update ci e2e tests
huangting4201 committed Feb 5, 2024
1 parent 99ee863 commit a73e51b
Showing 4 changed files with 105 additions and 22 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/e2e_test.yaml
@@ -41,3 +41,21 @@ jobs:
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
training_8GPU_ISP_CKPT:
runs-on: [t_cluster]
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP_CKPT
run: |
source $evo_env
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
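
The new training_8GPU_ISP_CKPT job (like the existing training_8GPU_ISP job above it) selects its tests with pytest's -m marker filter, so the markers training_8GPU_ISP, training_8GPU_ISP_SAVE_CKPT and training_8GPU_ISP_LOAD_CKPT all need to be registered with pytest to run without unknown-mark warnings. The repository presumably does this in its own pytest configuration; purely as an illustration, a minimal conftest.py registering them could look like this (the marker descriptions are assumptions):

# conftest.py -- illustrative sketch only; the marker descriptions are assumptions.
def pytest_configure(config):
    # Register the custom markers used by the CI jobs so "pytest -m <marker>"
    # selects tests cleanly instead of warning about unknown marks.
    for marker in (
        "training_8GPU_ISP: 8-GPU ISP training loss test",
        "training_8GPU_ISP_SAVE_CKPT: 8-GPU ISP training run that saves checkpoints",
        "training_8GPU_ISP_LOAD_CKPT: 8-GPU ISP training run that resumes from saved checkpoints",
    ):
        config.addinivalue_line("markers", marker)

Note that the SAVE_CKPT and LOAD_CKPT srun invocations run back to back in the same step, so the second one resumes from the checkpoints written by the first.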
48 changes: 41 additions & 7 deletions .github/workflows/weekly_test.yaml
@@ -22,7 +22,7 @@ jobs:
- name: training_8GPU
run: |
source $evo_env
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py
training_16GPU_8DP2TP:
runs-on: [t_cluster]
@@ -40,7 +40,7 @@ jobs:
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training/test_loss.py
training_16GPU_8DP2TPSP:
runs-on: [t_cluster]
@@ -57,9 +57,8 @@ jobs:
- name: training_16GPU_8DP2TPSP
run: |
source $evo_env
- sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
- sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' ./configs/7B_sft.py
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
+ sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP:
runs-on: [t_cluster]
@@ -77,7 +76,7 @@ jobs:
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP_InterleavedOverlap:
runs-on: [t_cluster]
@@ -96,7 +95,42 @@ jobs:
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
- srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
training_8GPU_ISP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP
run: |
source $evo_env
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
training_8GPU_ISP_CKPT:
runs-on: [t_cluster]
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP_CKPT
run: |
source $evo_env
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
unit_test_optimizer:
runs-on: [t_cluster]
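
Besides pointing every weekly job at ./tests/test_training/test_loss.py instead of the whole ./tests/test_training directory, the notable functional change in this file is the 8DP2TPSP job: rather than flipping a separate sequence_parallel flag, it now writes the sequence-parallel choice directly into the tensor setting. As a rough sketch, the patched configs/7B_sft.py ends up containing something like the following (only the tensor line comes verbatim from the sed command; the surrounding layout and the other values are assumptions):

# Illustrative excerpt of configs/7B_sft.py after the 8DP2TPSP sed rewrite; only the
# tensor line is exactly what the sed command writes, the other entries are assumed defaults.
parallel = dict(
    zero1=dict(size=-1),                              # assumed default ZeRO setting
    tensor=dict(size=2, mode="fsp"),                  # 2-way tensor parallel in "fsp" sequence-parallel mode
    pipeline=dict(size=1, interleaved_overlap=True),  # assumed default pipeline setting
)

The mode value lines up with the mtp/msp/fsp naming that the checkpoint code below distinguishes from ISP.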
3 changes: 2 additions & 1 deletion internlm/utils/model_checkpoint.py
@@ -322,6 +322,8 @@ def save_model_checkpoint(folder, model):
pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
wdp_rank = gpc.get_local_rank(ParallelMode.WEIGHT_DATA)

+ should_save_rank_pair = set()  # (tp_rank, dp_rank)

# TODO In theory, we should also consider pp level, but since pp is generally a state across machines,
# even if pp is not considered, it will definitely not be written on the same machine.

@@ -336,7 +338,6 @@ def save_model_checkpoint(folder, model):
llm_save(topo_fp, saved_obj=topo)
else:
# for tensor parallel mode with mtp/msp/fsp
- should_save_rank_pair = set()  # (tp_rank, dp_rank)
for i in range(tp_size):
if gpc.config.parallel.zero1.fsdp:
for j in range(dp_size):
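
The change here only hoists should_save_rank_pair out of the mtp/msp/fsp branch so the set exists regardless of which tensor-parallel mode is active. Below is a self-contained sketch of the deduplication idea the set supports: each (tp_rank, dp_rank) pair is added at most once, and only ranks whose pair is in the set write a shard, so data-parallel replicas do not save the same weights twice. This is a simplification with assumed policy details, not the repository's save_model_checkpoint implementation.

# Minimal sketch of the (tp_rank, dp_rank) ownership set -- a simplification with
# assumed policy details, not the actual save_model_checkpoint implementation.
def owns_shard(tp_rank: int, dp_rank: int, tp_size: int, dp_size: int, fsdp: bool) -> bool:
    should_save_rank_pair = set()  # (tp_rank, dp_rank)
    for i in range(tp_size):
        if fsdp:
            # with FSDP every data-parallel rank holds a distinct shard, so every pair writes
            for j in range(dp_size):
                should_save_rank_pair.add((i, j))
        else:
            # otherwise a single data-parallel replica per tensor rank is enough (assumed policy)
            should_save_rank_pair.add((i, i % dp_size))
    return (tp_rank, dp_rank) in should_save_rank_pair

# Example: with tp_size=2, dp_size=4 and no FSDP, only (0, 0) and (1, 1) write their shards.
assert owns_shard(0, 0, tp_size=2, dp_size=4, fsdp=False)
assert not owns_shard(0, 1, tp_size=2, dp_size=4, fsdp=False)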
58 changes: 44 additions & 14 deletions tests/test_training/test_loss.py
@@ -1,6 +1,7 @@
import math
import os
import subprocess
import shutil

import pytest
import torch
@@ -12,7 +13,7 @@
from internlm.core.trainer import TrainState
from internlm.initialize import initialize_distributed_env
from internlm.model.loss import FlashGPTLMLoss
- from internlm.model.metrics import AccPerplex, SchedulerMetricHook
+ from internlm.model.metrics import AccPerplex
from internlm.train import (
get_train_data_loader,
initialize_model,
@@ -53,6 +54,7 @@ def train(
num_chunks: int = 2,
interleaved: bool = False,
enable_sp: bool = False,
enable_ckpt: bool = False,
):
# initialize distributed environment
initialize_distributed_env(config=CONFIG_FILE_PATH)
@@ -92,6 +94,14 @@ def train(
skip_batches = gpc.config.data.skip_batches
label_smoothing = gpc.config.loss.label_smoothing

# update ckpt config
if enable_ckpt:
gpc.config.ckpt.enable_save_ckpt = True
gpc.config.ckpt.checkpoint_every = 5
gpc.config.ckpt.save_ckpt_folder = "local:llm_ckpts/"
gpc.config.ckpt.load_ckpt_info["content"] = ("all",)
gpc.config.ckpt.oss_snapshot_freq = 100

# get and broadcast current time
current_time = launch_time()
objs = [current_time]
@@ -139,19 +149,6 @@ def train(
dataset_types=dataset_types,
)

- # initialize trainer
- scheduler_hooks = [
- SchedulerMetricHook(
- metric=metric,
- skip=(
- gpc.is_using_parallel_mode(ParallelMode.PIPELINE)
- and hasattr(gpc.config.model, "num_chunks")
- and gpc.config.model.num_chunks > 1
- and gpc.config.parallel["pipeline"].get("interleaved_overlap", False)
- ),
- ),
- ]

# initialize trainer
trainer, train_dl, _, _ = internlm.initialize_trainer(
model=model,
@@ -213,6 +210,9 @@ def train(
cur_loss_list.append((loss.item() - moe_loss.item() if moe_loss is not None else loss.item()))
timer("fwd-bwd").stop()

if isp_communicator and isp_communicator.enable_memory_pool:
isp_communicator.memory_pool.reset_lazy_pools()

# update parameters, and returns (success_update, grad_norm)
trainer_result = trainer.step()
assert trainer_result is not None
@@ -226,6 +226,14 @@ def train(

timer("one-batch").stop()

# checkpoint the training states in specific steps, which is determined by the args "checkpoint_every"
# # save batch sampler that tracks the true consumed samples
now_break = ckpt_manager.try_save_checkpoint(train_state)
if now_break:
break

ckpt_manager.wait_async_upload_finish()


def check_loss_spike():
if gpc.is_rank_for_log():
@@ -311,3 +319,25 @@ def test_training_with_isp():

# model training
train(dp_size=4, tp_size=2, wp_size=4, enable_sp=True)


@pytest.mark.training_8GPU_ISP_SAVE_CKPT
def test_training_with_isp_save_ckpt():
# update config file
global CONFIG_FILE_PATH
CONFIG_FILE_PATH = "./configs/7B_isp_sft.py"

# model training save ckpt
train(dp_size=4, tp_size=2, wp_size=4, enable_sp=True, enable_ckpt=True)


@pytest.mark.training_8GPU_ISP_LOAD_CKPT
def test_training_with_isp_load_ckpt():
# update config file
global CONFIG_FILE_PATH
CONFIG_FILE_PATH = "./configs/7B_isp_sft.py"

shutil.rmtree("./llm_ckpts/10")

# model training load ckpt
train(dp_size=4, tp_size=2, wp_size=4, enable_sp=True, enable_ckpt=True)
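
The two new tests share the train(..., enable_ckpt=True) path: the SAVE_CKPT run writes a checkpoint every 5 steps into local:llm_ckpts/, while the LOAD_CKPT run first deletes the step-10 snapshot and then trains again, so it has to resume from an earlier checkpoint with content ("all",). For orientation, the checkpoint fields the test overrides would sit in a config such as configs/7B_isp_sft.py roughly like this (only the keys assigned in train() are taken from the test; the remaining keys and their values are assumptions):

# Illustrative ckpt section -- a sketch, not the actual configs/7B_isp_sft.py contents.
ckpt = dict(
    enable_save_ckpt=True,                # set by the test: turn checkpoint saving on
    checkpoint_every=5,                   # set by the test: save every 5 training steps
    save_ckpt_folder="local:llm_ckpts/",  # set by the test: local storage backend and folder
    load_ckpt_info=dict(
        path="local:llm_ckpts/",          # assumption: where the resumed run loads from
        content=("all",),                 # set by the test: restore model, optimizer and sampler state
        ckpt_type="internlm",             # assumption
    ),
    oss_snapshot_freq=100,                # set by the test: push snapshot uploads beyond this short run
)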
