feat(tests): update ci e2e tests (#45)
huangting4201 authored Feb 19, 2024
1 parent 1c3b892 commit be32910
Showing 6 changed files with 248 additions and 73 deletions.
26 changes: 20 additions & 6 deletions .github/workflows/e2e_test.yaml
@@ -10,7 +10,7 @@ env:
SLURM_PARTITION: llm_s

jobs:
training_8GPU:
training_4GPU:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -20,10 +20,10 @@ jobs:
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU
- name: training_4GPU
run: |
source $evo_env
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
training_8GPU_ISP:
runs-on: [t_cluster]
@@ -37,7 +37,21 @@ jobs:

- name: training_8GPU_ISP
run: |
source $evo_env
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2
source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
training_8GPU_ISP_CKPT:
runs-on: [t_cluster]
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP_CKPT
run: |
source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
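
Each job above selects its tests purely by pytest marker (-m "training_4GPU", -m "training_8GPU_ISP_SAVE_CKPT", ...) against ./tests/test_training/test_loss.py, and sizes the Slurm allocation to match (-n4 / -n8 with one GPU per task). Below is a minimal sketch of how such a marker is typically attached to a test; the helper, test name, and numbers are placeholders, not the repository's code.

    # Illustrative only: how a marker like "training_4GPU" is usually wired up so
    # that `pytest -m "training_4GPU"` selects the test. The helper and threshold
    # below are placeholders, not InternEvo code.
    import pytest


    def run_short_training(num_gpus: int) -> float:
        """Stand-in for a short distributed training run returning a final loss."""
        return 3.95  # placeholder value


    @pytest.mark.training_4GPU
    def test_loss_on_4_gpus():
        final_loss = run_short_training(num_gpus=4)
        # hypothetical tolerance check against a stored baseline loss
        assert abs(final_loss - 4.0) < 0.2

In a real setup the marker would also be registered (for example in pytest.ini under "markers =") so the -m filter does not warn about an unknown mark.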
136 changes: 114 additions & 22 deletions .github/workflows/weekly_test.yaml
@@ -7,7 +7,7 @@ env:
SLURM_PARTITION: llm_s

jobs:
training_8GPU:
training_4GPU:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -19,12 +19,12 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU
- name: training_4GPU
run: |
source $evo_env
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
training_16GPU_8DP2TP:
training_8GPU_4DP2TP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -36,13 +36,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2TP
- name: training_8GPU_4DP2TP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
training_16GPU_8DP2TPSP:
training_8GPU_4DP2TPSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -54,14 +54,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2TPSP
- name: training_8GPU_4DP2TPSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP:
training_8GPU_4DP2PP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -73,13 +72,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2PP
- name: training_8GPU_4DP2PP
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP_InterleavedOverlap:
training_8GPU_4DP2PP_InterleavedOverlap:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -91,12 +90,100 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2PP_InterleavedOverlap
- name: training_8GPU_4DP2PP_InterleavedOverlap
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
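
Most of the weekly jobs patch configs/7B_sft.py in place with sed before launching, so the layout under test is exactly what the edited config ends up saying. For readers less used to sed, here is a rough Python equivalent of the two edits in the interleaved-overlap job above; it is illustrative only (the workflow keeps using sed, and the replacement indentation is approximate).

    # Rough Python equivalent of the sed edits used above (illustrative only).
    import re
    from pathlib import Path

    cfg_path = Path("./configs/7B_sft.py")
    text = cfg_path.read_text()
    # mirrors: sed -i 's/^.*pipeline=.*/    pipeline=dict(size=2, interleaved_overlap=True),/'
    text = re.sub(r"^.*pipeline=.*$",
                  "    pipeline=dict(size=2, interleaved_overlap=True),",
                  text, flags=re.MULTILINE)
    # mirrors: sed -i 's/^.*num_chunks=.*/    num_chunks=2,/'
    text = re.sub(r"^.*num_chunks=.*$", "    num_chunks=2,", text, flags=re.MULTILINE)
    cfg_path.write_text(text)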
training_16GPU_4DP2TP2PP_MTP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MTP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="mtp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_MSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="msp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_FSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_FSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
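
The job names encode the layout, and the srun -n value is simply the product of the parallel sizes with one GPU per task: 4DP2TP and 4DP2PP need 8 ranks, 4DP2TP2PP needs 16. A quick arithmetic check, assuming the dp/tp/pp reading of the names used above:

    # Sanity check of the GPU counts implied by the job names (dp * tp * pp),
    # assuming the dp/tp/pp naming convention of the jobs above.
    def world_size(dp: int, tp: int = 1, pp: int = 1) -> int:
        return dp * tp * pp

    assert world_size(dp=4) == 4                 # training_4GPU
    assert world_size(dp=4, tp=2) == 8           # training_8GPU_4DP2TP / 4DP2TPSP
    assert world_size(dp=4, pp=2) == 8           # training_8GPU_4DP2PP variants
    assert world_size(dp=4, tp=2, pp=2) == 16    # training_16GPU_4DP2TP2PP_{MTP,MSP,FSP}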
training_8GPU_ISP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP
run: |
source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
training_8GPU_ISP_CKPT:
runs-on: [t_cluster]
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU_ISP_CKPT
run: |
source activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
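
training_8GPU_ISP_CKPT runs two markers back to back: the first run saves a checkpoint, the second loads it and re-checks the loss, which is also why this job gets a 20-minute budget instead of 10. A heavily simplified, self-contained illustration of that save-then-resume pattern follows; the toy state, file path, and function names are made up and are not the repository's test logic.

    # Toy illustration of the SAVE_CKPT -> LOAD_CKPT pairing (hypothetical names/paths).
    import json
    from pathlib import Path

    CKPT = Path("/tmp/isp_ckpt.json")  # stand-in for the real checkpoint folder

    def train(steps, state=None):
        state = dict(state or {"step": 0, "loss": 10.0})
        for _ in range(steps):
            state["step"] += 1
            state["loss"] *= 0.9  # toy "training"
        return state

    def test_save_ckpt():
        CKPT.write_text(json.dumps(train(steps=5)))

    def test_load_ckpt():
        resumed = train(steps=5, state=json.loads(CKPT.read_text()))
        fresh = train(steps=10)
        # resuming from the checkpoint should match an uninterrupted run
        assert abs(resumed["loss"] - fresh["loss"]) < 1e-9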
unit_test_optimizer:
runs-on: [t_cluster]
@@ -162,11 +249,16 @@ jobs:
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [
training_8GPU,
training_16GPU_8DP2TP,
training_16GPU_8DP2TPSP,
training_16GPU_8DP2PP,
training_16GPU_8DP2PP_InterleavedOverlap,
training_4GPU,
training_8GPU_4DP2TP,
training_8GPU_4DP2TPSP,
training_8GPU_4DP2PP,
training_8GPU_4DP2PP_InterleavedOverlap,
training_16GPU_4DP2TP2PP_MTP,
training_16GPU_4DP2TP2PP_MSP,
training_16GPU_4DP2TP2PP_FSP,
training_8GPU_ISP,
training_8GPU_ISP_CKPT,
unit_test_optimizer,
unit_test_model,
load_ckpt_then_assert_loss
2 changes: 1 addition & 1 deletion configs/7B_sft.py
@@ -173,7 +173,7 @@
3. memory_pool: bool, enable/disable memory pool, defaults to False.
"""
parallel = dict(
zero1=dict(size=8),
zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True, memory_pool=True),
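
The only config change is zero1=dict(size=8) becoming zero1=dict(size=-1). In InternEvo's convention a non-positive zero1 size is taken to mean "use the whole data-parallel group as the ZeRO-1 group", so the value no longer has to be kept in sync with whatever GPU count a given CI job uses. A small sketch of that resolution rule as I read it (illustrative, not code from the repo):

    # Sketch of how a zero1 size of -1 is typically resolved (my reading; illustrative).
    def resolve_zero1_size(zero1_size: int, dp_world_size: int) -> int:
        """<= 0 means 'span the whole data-parallel group'; otherwise it must divide it."""
        if zero1_size <= 0:
            return dp_world_size
        assert dp_world_size % zero1_size == 0, "zero1 size must divide the dp group size"
        return zero1_size

    assert resolve_zero1_size(-1, dp_world_size=4) == 4   # the new default works on 4 GPUs
    assert resolve_zero1_size(8, dp_world_size=8) == 8    # the old value assumed 8 dp ranks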
3 changes: 2 additions & 1 deletion internlm/utils/model_checkpoint.py
@@ -322,6 +322,8 @@ def save_model_checkpoint(folder, model):
pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
wdp_rank = gpc.get_local_rank(ParallelMode.WEIGHT_DATA)

should_save_rank_pair = set() # (tp_rank, dp_rank)

# TODO In theory, we should also consider pp level, but since pp is generally a state across machines,
# even if pp is not considered, it will definitely not be written on the same machine.

@@ -336,7 +338,6 @@ def save_model_checkpoint(folder, model):
llm_save(topo_fp, saved_obj=topo)
else:
# for tensor parallel mode with mtp/msp/fsp
should_save_rank_pair = set() # (tp_rank, dp_rank)
for i in range(tp_size):
if gpc.config.parallel.zero1.fsdp:
for j in range(dp_size):
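
The checkpoint change only moves should_save_rank_pair = set() out of the mtp/msp/fsp-only branch so the set exists on every code path, presumably so the ISP checkpointing exercised by the new CI jobs can reuse it. For context, a stripped-down version of the rank-pair dedup idea; this is illustrative, and the real selection in save_model_checkpoint may pick different representative ranks.

    # Stripped-down illustration of choosing which (tp_rank, dp_rank) pairs write
    # model shards; not the exact logic in save_model_checkpoint.
    def ranks_that_save(tp_size: int, dp_size: int, fsdp: bool):
        should_save_rank_pair = set()  # (tp_rank, dp_rank)
        for i in range(tp_size):
            if fsdp:
                # with FSDP every dp rank holds a distinct shard, so all pairs must write
                for j in range(dp_size):
                    should_save_rank_pair.add((i, j))
            else:
                # otherwise dp ranks hold replicas, so one representative per tp rank
                # is enough (spread across dp ranks here; the real choice may differ)
                should_save_rank_pair.add((i, i % dp_size))
        return should_save_rank_pair

    assert ranks_that_save(tp_size=2, dp_size=4, fsdp=False) == {(0, 0), (1, 1)}
    assert len(ranks_that_save(tp_size=2, dp_size=4, fsdp=True)) == 8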
3 changes: 1 addition & 2 deletions tests/test_training/test_forward_output_no_fa.py
@@ -11,10 +11,9 @@
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.context.parallel_context import Config
from internlm.core.scheduler import SchedulerMetricHook
from internlm.initialize.launch import args_sanity_check
from internlm.model.loss import FlashGPTLMLoss
from internlm.model.metrics import AccPerplex
from internlm.model.metrics import AccPerplex, SchedulerMetricHook
from internlm.train import get_train_data_loader, initialize_model, initialize_optimizer
from internlm.utils.logger import get_logger

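
This test now imports SchedulerMetricHook from internlm.model.metrics instead of internlm.core.scheduler, suggesting the hook now lives in (or is re-exported from) the metrics module. If code outside the repository has to run against both layouts, a guarded import is the usual workaround; this is only an illustration, the test itself simply uses the new path.

    # Illustrative compatibility shim for code that must work across both import paths.
    try:
        from internlm.model.metrics import SchedulerMetricHook  # location used by this PR
    except ImportError:  # older revisions kept the hook in the scheduler module
        from internlm.core.scheduler import SchedulerMetricHook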