Commit
feat(tests): update weekly tests
huangting4201 committed Feb 6, 2024
1 parent 185db08 commit 4182ed7
Showing 3 changed files with 139 additions and 47 deletions.
8 changes: 3 additions & 5 deletions .github/workflows/e2e_test.yaml
@@ -10,7 +10,7 @@ env:
SLURM_PARTITION: llm_s

jobs:
training_8GPU:
training_4GPU:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -20,10 +20,10 @@ jobs:
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU
- name: training_4GPU
run: |
source $evo_env
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
training_8GPU_ISP:
runs-on: [t_cluster]
@@ -37,7 +37,6 @@ jobs:

- name: training_8GPU_ISP
run: |
source $evo_env
conda activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
@@ -54,7 +53,6 @@

- name: training_8GPU_ISP_CKPT
run: |
source $evo_env
conda activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
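The flag changes above follow a simple rule: each marked test asks srun for exactly dp_size * tp_size * pp_size tasks, one GPU per task. A minimal sketch of that arithmetic (hypothetical helper, not part of the repository):

def required_ranks(dp_size: int = 1, tp_size: int = 1, pp_size: int = 1) -> int:
    """Number of srun tasks (one GPU each) a given parallel layout needs."""
    return dp_size * tp_size * pp_size

# The renamed training_4GPU job requests -n4: 4-way data parallelism only.
assert required_ranks(dp_size=4) == 4
# The weekly 8-GPU jobs below combine 4-way DP with 2-way TP or PP.
assert required_ranks(dp_size=4, tp_size=2) == 8
# The new 16-GPU jobs use 4-way DP, 2-way TP and 2-way PP.
assert required_ranks(dp_size=4, tp_size=2, pp_size=2) == 16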
89 changes: 72 additions & 17 deletions .github/workflows/weekly_test.yaml
@@ -7,7 +7,7 @@ env:
SLURM_PARTITION: llm_s

jobs:
training_8GPU:
training_4GPU:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -19,12 +19,12 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU
- name: training_4GPU
run: |
source $evo_env
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
training_16GPU_8DP2TP:
training_8GPU_4DP2TP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -36,13 +36,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2TP
- name: training_8GPU_4DP2TP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
training_16GPU_8DP2TPSP:
training_8GPU_4DP2TPSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -54,13 +54,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2TPSP
- name: training_8GPU_4DP2TPSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP:
training_8GPU_4DP2PP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -72,13 +72,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2PP
- name: training_8GPU_4DP2PP
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP_InterleavedOverlap:
training_8GPU_4DP2PP_InterleavedOverlap:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -90,12 +90,69 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2PP_InterleavedOverlap
- name: training_8GPU_4DP2PP_InterleavedOverlap
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_MTP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MTP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="mtp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_MSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="msp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_FSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_FSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
training_8GPU_ISP:
runs-on: [t_cluster]
@@ -109,7 +166,6 @@ jobs:

- name: training_8GPU_ISP
run: |
source $evo_env
conda activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
@@ -126,7 +182,6 @@

- name: training_8GPU_ISP_CKPT
run: |
source $evo_env
conda activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
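Each weekly job patches ./configs/7B_sft.py with sed before launching. For the new 16-GPU jobs, the parallel settings presumably end up as in the sketch below (only the two rewritten lines are shown; every other field of the real config is assumed unchanged, and mode is "mtp", "msp" or "fsp" depending on the job):

# configs/7B_sft.py (excerpt): assumed state after the two sed edits in training_16GPU_4DP2TP2PP_MTP.
# All other entries of the repository config are omitted here.
parallel = dict(
    tensor=dict(size=2, mode="mtp"),  # rewritten by the first sed line
    pipeline=dict(size=2),            # rewritten by the second sed line
)

The InterleavedOverlap job rewrites the same file to pipeline=dict(size=2, interleaved_overlap=True) and num_chunks=2 in the same way.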
89 changes: 64 additions & 25 deletions tests/test_training/test_loss.py
@@ -29,17 +29,18 @@
TOTAL_STEPS = 10
LOSS_SPIKE_LIMIT = 1.5
LOSS_DEVIATION_LIMIT = 0.2
# dp_size = 4
BASELINE_LOSS_LIST = [
11.64188003540039,
7.9205322265625,
6.944362163543701,
6.147305488586426,
6.060564994812012,
5.660439491271973,
5.19430685043335,
5.157323837280273,
4.769168376922607,
4.449280738830566,
11.680583953857422,
7.83256721496582,
6.745327949523926,
6.187380790710449,
5.421087265014648,
5.3960981369018555,
5.090664863586426,
4.77808952331543,
4.6484055519104,
4.634660720825195
]
cur_loss_list = []
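The baseline now reflects a 4-way data-parallel run (hence the added # dp_size = 4 comment), and each recorded loss is checked against LOSS_SPIKE_LIMIT and LOSS_DEVIATION_LIMIT. A rough, illustrative sketch of the comparison the helpers further down presumably perform (not the repository implementation):

def check_losses_sketch(cur, baseline, spike_limit=LOSS_SPIKE_LIMIT, deviation_limit=LOSS_DEVIATION_LIMIT):
    # Illustrative only: flag step-to-step spikes and drift from the recorded baseline.
    for step in range(1, len(cur)):
        assert cur[step] <= cur[step - 1] * spike_limit, f"loss spike at step {step}"
    for cur_val, ref_val in zip(cur, baseline):
        assert abs(cur_val - ref_val) <= deviation_limit, f"loss deviates from baseline: {ref_val}->{cur_val}"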

@@ -51,6 +52,7 @@ def train(
pp_size: int = 1,
num_chunks: int = 2,
interleaved: bool = False,
tp_mode: str = "mtp",
enable_sp: bool = False,
enable_ckpt: bool = False,
):
@@ -84,6 +86,7 @@ def train(
assert gpc.config.parallel.get(
"sequence_parallel", False
), "sequence_parallel must be True when enable_sp is True"
assert gpc.config.parallel["tensor"]["mode"] == tp_mode

# init setting
gpc.config.data.total_steps = TOTAL_STEPS
@@ -250,10 +253,10 @@ def check_loss_accuracy():
), f"The loss accuracy is abnormal, {target}->{cur}, please check it!"


@pytest.mark.training_8GPU
def test_training_loss_with_dp8():
@pytest.mark.training_4GPU
def test_training_loss_with_dp4():
# model training
train(dp_size=8)
train(dp_size=4)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
@@ -262,10 +265,10 @@ def test_training_loss_with_dp8():
check_loss_accuracy()


@pytest.mark.training_16GPU_8DP2TP
def test_training_loss_with_dp8_tp2():
@pytest.mark.training_8GPU_4DP2TP
def test_training_loss_with_dp4_tp2():
# model training
train(dp_size=8, tp_size=2)
train(dp_size=4, tp_size=2)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
@@ -274,10 +277,10 @@ def test_training_loss_with_dp8_tp2():
check_loss_accuracy()


@pytest.mark.training_16GPU_8DP2TPSP
def test_training_loss_with_dp8_tp2_sp():
@pytest.mark.training_8GPU_4DP2TPSP
def test_training_loss_with_dp4_tp2_sp():
# model training
train(dp_size=8, tp_size=2, enable_sp=True)
train(dp_size=4, tp_size=2, enable_sp=True)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
@@ -286,10 +289,10 @@ def test_training_loss_with_dp8_tp2_sp():
check_loss_accuracy()


@pytest.mark.training_16GPU_8DP2PP
def test_training_loss_with_dp8_pp2():
@pytest.mark.training_8GPU_4DP2PP
def test_training_loss_with_dp4_pp2():
# model training
train(dp_size=8, pp_size=2)
train(dp_size=4, pp_size=2)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
@@ -298,10 +301,46 @@ def test_training_loss_with_dp8_pp2():
check_loss_accuracy()


@pytest.mark.training_16GPU_8DP2PP_InterleavedOverlap
def test_training_loss_with_dp8_pp2_interleaved_overlap():
@pytest.mark.training_8GPU_4DP2PP_InterleavedOverlap
def test_training_loss_with_dp4_pp2_interleaved_overlap():
# model training
train(dp_size=8, pp_size=2, interleaved=True)
train(dp_size=4, pp_size=2, interleaved=True)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)

check_loss_spike()
check_loss_accuracy()


@pytest.mark.training_16GPU_4DP2TP2PP_MTP
def test_training_loss_with_dp4_tp2_pp2_mtp():
# model training
train(dp_size=4, tp_size=2, pp_size=2)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)

check_loss_spike()
check_loss_accuracy()


@pytest.mark.training_16GPU_4DP2TP2PP_MSP
def test_training_loss_with_dp4_tp2_pp2_msp():
# model training
train(dp_size=4, tp_size=2, pp_size=2, tp_mode="msp")

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)

check_loss_spike()
check_loss_accuracy()


@pytest.mark.training_16GPU_4DP2TP2PP_FSP
def test_training_loss_with_dp4_tp2_pp2_fsp():
# model training
train(dp_size=4, tp_size=2, pp_size=2, tp_mode="fsp")

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
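Taken together, the renamed markers and the new tp_mode argument give the following matrix; this is a summary of the calls shown above, not code from the repository (the two ISP markers are omitted because their train() arguments are not part of this diff):

# Marker -> train(...) arguments exercised by the updated weekly workflow.
WEEKLY_TEST_MATRIX = {
    "training_4GPU": dict(dp_size=4),
    "training_8GPU_4DP2TP": dict(dp_size=4, tp_size=2),
    "training_8GPU_4DP2TPSP": dict(dp_size=4, tp_size=2, enable_sp=True),
    "training_8GPU_4DP2PP": dict(dp_size=4, pp_size=2),
    "training_8GPU_4DP2PP_InterleavedOverlap": dict(dp_size=4, pp_size=2, interleaved=True),
    "training_16GPU_4DP2TP2PP_MTP": dict(dp_size=4, tp_size=2, pp_size=2),  # tp_mode defaults to "mtp"
    "training_16GPU_4DP2TP2PP_MSP": dict(dp_size=4, tp_size=2, pp_size=2, tp_mode="msp"),
    "training_16GPU_4DP2TP2PP_FSP": dict(dp_size=4, tp_size=2, pp_size=2, tp_mode="fsp"),
}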
