Commit
feat(tests): update weekly tests
huangting4201 committed Feb 6, 2024
1 parent 185db08 commit 4182ed7
Showing 3 changed files with 139 additions and 47 deletions.
8 changes: 3 additions & 5 deletions .github/workflows/e2e_test.yaml
@@ -10,7 +10,7 @@ env:
SLURM_PARTITION: llm_s

jobs:
training_8GPU:
training_4GPU:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -20,10 +20,10 @@ jobs:
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_8GPU
- name: training_4GPU
run: |
source $evo_env
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
training_8GPU_ISP:
runs-on: [t_cluster]
@@ -37,7 +37,6 @@ jobs:

- name: training_8GPU_ISP
run: |
source $evo_env
conda activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
@@ -54,7 +53,6 @@

- name: training_8GPU_ISP_CKPT
run: |
source $evo_env
conda activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
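The flag changes above follow a simple rule: each marked test asks srun for exactly dp_size * tp_size * pp_size tasks, one GPU per task. A minimal sketch of that arithmetic (hypothetical helper, not part of the repository):

def required_ranks(dp_size: int = 1, tp_size: int = 1, pp_size: int = 1) -> int:
    """Number of srun tasks (one GPU each) a given parallel layout needs."""
    return dp_size * tp_size * pp_size

# The renamed training_4GPU job requests -n4: 4-way data parallelism only.
assert required_ranks(dp_size=4) == 4
# The weekly 8-GPU jobs below combine 4-way DP with 2-way TP or PP.
assert required_ranks(dp_size=4, tp_size=2) == 8
# The new 16-GPU jobs use 4-way DP, 2-way TP and 2-way PP.
assert required_ranks(dp_size=4, tp_size=2, pp_size=2) == 16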
89 changes: 72 additions & 17 deletions .github/workflows/weekly_test.yaml
@@ -7,7 +7,7 @@ env:
SLURM_PARTITION: llm_s

jobs:
training_8GPU:
training_4GPU:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -19,12 +19,12 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU
- name: training_4GPU
run: |
source $evo_env
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
training_16GPU_8DP2TP:
training_8GPU_4DP2TP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -36,13 +36,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2TP
- name: training_8GPU_4DP2TP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
training_16GPU_8DP2TPSP:
training_8GPU_4DP2TPSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -54,13 +54,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2TPSP
- name: training_8GPU_4DP2TPSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP:
training_8GPU_4DP2PP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -72,13 +72,13 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2PP
- name: training_8GPU_4DP2PP
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
training_16GPU_8DP2PP_InterleavedOverlap:
training_8GPU_4DP2PP_InterleavedOverlap:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
@@ -90,12 +90,69 @@ jobs:
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_8DP2PP_InterleavedOverlap
- name: training_8GPU_4DP2PP_InterleavedOverlap
run: |
source $evo_env
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_MTP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MTP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="mtp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_MSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="msp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
training_16GPU_4DP2TP2PP_FSP:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_FSP
run: |
source $evo_env
sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
training_8GPU_ISP:
runs-on: [t_cluster]
@@ -109,7 +166,6 @@ jobs:

- name: training_8GPU_ISP
run: |
source $evo_env
conda activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
@@ -126,7 +182,6 @@

- name: training_8GPU_ISP_CKPT
run: |
source $evo_env
conda activate ${evo_env_torch21_flash2}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
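Each weekly job patches ./configs/7B_sft.py with sed before launching. For the new 16-GPU jobs, the parallel settings presumably end up as in the sketch below (only the two rewritten lines are shown; every other field of the real config is assumed unchanged, and mode is "mtp", "msp" or "fsp" depending on the job):

# configs/7B_sft.py (excerpt): assumed state after the two sed edits in training_16GPU_4DP2TP2PP_MTP.
# All other entries of the repository config are omitted here.
parallel = dict(
    tensor=dict(size=2, mode="mtp"),  # rewritten by the first sed line
    pipeline=dict(size=2),            # rewritten by the second sed line
)

The InterleavedOverlap job rewrites the same file to pipeline=dict(size=2, interleaved_overlap=True) and num_chunks=2 in the same way.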
89 changes: 64 additions & 25 deletions tests/test_training/test_loss.py
@@ -29,17 +29,18 @@
TOTAL_STEPS = 10
LOSS_SPIKE_LIMIT = 1.5
LOSS_DEVIATION_LIMIT = 0.2
# dp_size = 4
BASELINE_LOSS_LIST = [
11.64188003540039,
7.9205322265625,
6.944362163543701,
6.147305488586426,
6.060564994812012,
5.660439491271973,
5.19430685043335,
5.157323837280273,
4.769168376922607,
4.449280738830566,
11.680583953857422,
7.83256721496582,
6.745327949523926,
6.187380790710449,
5.421087265014648,
5.3960981369018555,
5.090664863586426,
4.77808952331543,
4.6484055519104,
4.634660720825195
]
cur_loss_list = []
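The baseline now reflects a 4-way data-parallel run (hence the added # dp_size = 4 comment), and each recorded loss is checked against LOSS_SPIKE_LIMIT and LOSS_DEVIATION_LIMIT. A rough, illustrative sketch of the comparison the helpers further down presumably perform (not the repository implementation):

def check_losses_sketch(cur, baseline, spike_limit=LOSS_SPIKE_LIMIT, deviation_limit=LOSS_DEVIATION_LIMIT):
    # Illustrative only: flag step-to-step spikes and drift from the recorded baseline.
    for step in range(1, len(cur)):
        assert cur[step] <= cur[step - 1] * spike_limit, f"loss spike at step {step}"
    for cur_val, ref_val in zip(cur, baseline):
        assert abs(cur_val - ref_val) <= deviation_limit, f"loss deviates from baseline: {ref_val}->{cur_val}"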

@@ -51,6 +52,7 @@ def train(
pp_size: int = 1,
num_chunks: int = 2,
interleaved: bool = False,
tp_mode: str = "mtp",
enable_sp: bool = False,
enable_ckpt: bool = False,
):
@@ -84,6 +86,7 @@ def train(
assert gpc.config.parallel.get(
"sequence_parallel", False
), "sequence_parallel must be True when enable_sp is True"
assert gpc.config.parallel["tensor"]["mode"] == tp_mode

# init setting
gpc.config.data.total_steps = TOTAL_STEPS
@@ -250,10 +253,10 @@ def check_loss_accuracy():
), f"The loss accuracy is abnormal, {target}->{cur}, please check it!"


@pytest.mark.training_8GPU
def test_training_loss_with_dp8():
@pytest.mark.training_4GPU
def test_training_loss_with_dp4():
# model training
train(dp_size=8)
train(dp_size=4)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
@@ -262,10 +265,10 @@ def test_training_loss_with_dp8():
check_loss_accuracy()


@pytest.mark.training_16GPU_8DP2TP
def test_training_loss_with_dp8_tp2():
@pytest.mark.training_8GPU_4DP2TP
def test_training_loss_with_dp4_tp2():
# model training
train(dp_size=8, tp_size=2)
train(dp_size=4, tp_size=2)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
@@ -274,10 +277,10 @@ def test_training_loss_with_dp8_tp2():
check_loss_accuracy()


@pytest.mark.training_16GPU_8DP2TPSP
def test_training_loss_with_dp8_tp2_sp():
@pytest.mark.training_8GPU_4DP2TPSP
def test_training_loss_with_dp4_tp2_sp():
# model training
train(dp_size=8, tp_size=2, enable_sp=True)
train(dp_size=4, tp_size=2, enable_sp=True)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
@@ -286,10 +289,10 @@ def test_training_loss_with_dp8_tp2_sp():
check_loss_accuracy()


@pytest.mark.training_16GPU_8DP2PP
def test_training_loss_with_dp8_pp2():
@pytest.mark.training_8GPU_4DP2PP
def test_training_loss_with_dp4_pp2():
# model training
train(dp_size=8, pp_size=2)
train(dp_size=4, pp_size=2)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
@@ -298,10 +301,46 @@ def test_training_loss_with_dp8_pp2():
check_loss_accuracy()


@pytest.mark.training_16GPU_8DP2PP_InterleavedOverlap
def test_training_loss_with_dp8_pp2_interleaved_overlap():
@pytest.mark.training_8GPU_4DP2PP_InterleavedOverlap
def test_training_loss_with_dp4_pp2_interleaved_overlap():
# model training
train(dp_size=8, pp_size=2, interleaved=True)
train(dp_size=4, pp_size=2, interleaved=True)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)

check_loss_spike()
check_loss_accuracy()


@pytest.mark.training_16GPU_4DP2TP2PP_MTP
def test_training_loss_with_dp4_tp2_pp2_mtp():
# model training
train(dp_size=4, tp_size=2, pp_size=2)

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)

check_loss_spike()
check_loss_accuracy()


@pytest.mark.training_16GPU_4DP2TP2PP_MSP
def test_training_loss_with_dp4_tp2_pp2_msp():
# model training
train(dp_size=4, tp_size=2, pp_size=2, tp_mode="msp")

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)

check_loss_spike()
check_loss_accuracy()


@pytest.mark.training_16GPU_4DP2TP2PP_FSP
def test_training_loss_with_dp4_tp2_pp2_fsp():
# model training
train(dp_size=4, tp_size=2, pp_size=2, tp_mode="fsp")

# print loss value
print(f"cur_loss_list: {cur_loss_list}", flush=True)
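Taken together, the renamed markers and the new tp_mode argument give the following matrix; this is a summary of the calls shown above, not code from the repository (the two ISP markers are omitted because their train() arguments are not part of this diff):

# Marker -> train(...) arguments exercised by the updated weekly workflow.
WEEKLY_TEST_MATRIX = {
    "training_4GPU": dict(dp_size=4),
    "training_8GPU_4DP2TP": dict(dp_size=4, tp_size=2),
    "training_8GPU_4DP2TPSP": dict(dp_size=4, tp_size=2, enable_sp=True),
    "training_8GPU_4DP2PP": dict(dp_size=4, pp_size=2),
    "training_8GPU_4DP2PP_InterleavedOverlap": dict(dp_size=4, pp_size=2, interleaved=True),
    "training_16GPU_4DP2TP2PP_MTP": dict(dp_size=4, tp_size=2, pp_size=2),  # tp_mode defaults to "mtp"
    "training_16GPU_4DP2TP2PP_MSP": dict(dp_size=4, tp_size=2, pp_size=2, tp_mode="msp"),
    "training_16GPU_4DP2TP2PP_FSP": dict(dp_size=4, tp_size=2, pp_size=2, tp_mode="fsp"),
}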
