# feat(checkpoint): support universal checkpoint #1252
# Workflow file for this run.
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# End-to-end training tests, triggered by pull requests that target `develop`.
name: e2e-tests
on:
  pull_request:
    branches:
      - "develop"
    # The workflow is skipped when every changed file matches these patterns.
    paths-ignore:
      - "doc/**"
      - "**.md"
env:
  # Stored as a literal string: workflow-level `env` values are not run through
  # a shell. Steps splice it into their scripts via `${{env.WORKSPACE_PREFIX}}`,
  # and bash performs the command substitution there.
  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
  # Slurm partition handed to `srun -p` by the jobs that run on the cluster.
  SLURM_PARTITION: llm_s
jobs:
# training_4GPU: runs the "training_4GPU" pytest marker as a 4-task Slurm job.
  training_4GPU:
    runs-on: [t_cluster]
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_4GPU
        run: |
          # `evo_env_torch21_flash2` is not defined in this workflow; presumably
          # it comes from the runner environment - TODO confirm.
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # The default `bash -e` shell stops at the first failing command, so
          # a bare `exit_code=$?` after srun would never see a non-zero status.
          # Capture it with `||` so the helper script also runs on failure.
          exit_code=0
          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
          # A failed srun always fails the step, whatever the helper returned.
          if [ "$exit_code" -ne 0 ]; then exit "$exit_code"; fi
# training_8GPU_ISP: runs the "training_8GPU_ISP" pytest marker as an 8-task
# Slurm job.
  training_8GPU_ISP:
    runs-on: [t_cluster]
    timeout-minutes: 10
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_ISP
        run: |
          # `evo_env_torch21_flash2` is not defined in this workflow; presumably
          # it comes from the runner environment - TODO confirm.
          source activate ${evo_env_torch21_flash2}
          jobname=ISP-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # The default `bash -e` shell stops at the first failing command, so
          # a bare `exit_code=$?` after srun would never see a non-zero status.
          # Capture it with `||` so the helper script also runs on failure.
          exit_code=0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
          # A failed srun always fails the step, whatever the helper returned.
          if [ "$exit_code" -ne 0 ]; then exit "$exit_code"; fi
# training_8GPU_ISP_CKPT: two consecutive 8-task Slurm runs in one step, first
# the "training_8GPU_ISP_SAVE_CKPT" marker, then "training_8GPU_ISP_LOAD_CKPT".
  training_8GPU_ISP_CKPT:
    runs-on: [t_cluster]
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_ISP_CKPT
        run: |
          # `evo_env_torch21_flash2` is not defined in this workflow; presumably
          # it comes from the runner environment - TODO confirm.
          source activate ${evo_env_torch21_flash2}

          # Phase 1: save. The default `bash -e` shell stops at the first
          # failing command, so a bare `exit_code=$?` after srun would never
          # see a non-zero status; capture it with `||` instead.
          jobname=ISP_CKPT-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          exit_code=0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
          # A failed save fails the step here, so the load phase is skipped.
          if [ "$exit_code" -ne 0 ]; then exit "$exit_code"; fi

          # Phase 2: load, under its own Slurm job name.
          jobname=LOAD-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          exit_code=0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
          # A failed srun always fails the step, whatever the helper returned.
          if [ "$exit_code" -ne 0 ]; then exit "$exit_code"; fi
# training_8GPU_4DP2TP: runs the "training_8GPU_4DP2TP" pytest marker through
# torchrun (8 processes, 1 node) on the 910B runner.
  training_8GPU_4DP2TP:
    strategy:
      matrix:
        # Single-entry matrix; the value doubles as the runner label.
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          # On the 910B runner, force-remove untracked and ignored files from
          # the workspace before the checkout step runs.
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2TP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # The command is passed as one string to a launcher script that lives
          # outside the checkout (one directory above the working directory).
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"
# training_8GPU_4DP2TPSP: runs the "training_8GPU_4DP2TPSP" pytest marker
# through torchrun (8 processes, 1 node) on the 910B runner.
  training_8GPU_4DP2TPSP:
    strategy:
      matrix:
        # Single-entry matrix; the value doubles as the runner label.
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          # On the 910B runner, force-remove untracked and ignored files from
          # the workspace before the checkout step runs.
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2TPSP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # The command is passed as one string to a launcher script that lives
          # outside the checkout (one directory above the working directory).
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"
# training_8GPU_4DP2PP: runs the "training_8GPU_4DP2PP" pytest marker through
# torchrun (8 processes, 1 node) on the 910B runner.
  training_8GPU_4DP2PP:
    strategy:
      matrix:
        # Single-entry matrix; the value doubles as the runner label.
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          # On the 910B runner, force-remove untracked and ignored files from
          # the workspace before the checkout step runs.
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2PP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # The command is passed as one string to a launcher script that lives
          # outside the checkout (one directory above the working directory).
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"
# training_8GPU_4DP2PP_ZB: runs the "training_8GPU_4DP2PP_ZB" pytest marker as
# an 8-task Slurm job.
  training_8GPU_4DP2PP_ZB:
    runs-on: [t_cluster]
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2PP_ZB
        run: |
          # `evo_env_torch21_flash2` is not defined in this workflow; presumably
          # it comes from the runner environment - TODO confirm.
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # The default `bash -e` shell stops at the first failing command, so
          # a bare `exit_code=$?` after srun would never see a non-zero status.
          # Capture it with `||` so the helper script also runs on failure.
          exit_code=0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_ZB" ./tests/test_training/test_loss.py || exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
          # A failed srun always fails the step, whatever the helper returned.
          if [ "$exit_code" -ne 0 ]; then exit "$exit_code"; fi
# training_16GPU_4DP2TP2PP_MTP: runs the "training_16GPU_4DP2TP2PP_MTP" pytest
# marker through torchrun (8 processes per node, 2 nodes) on the 910B runner.
  training_16GPU_4DP2TP2PP_MTP:
    strategy:
      matrix:
        # Single-entry matrix; the value doubles as the runner label.
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          # On the 910B runner, force-remove untracked and ignored files from
          # the workspace before the checkout step runs.
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_MTP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # Single quotes keep $MASTER_ADDR, $MASTER_PORT and $RANK unexpanded
          # in this step; presumably they are resolved wherever the launcher
          # finally runs the command - TODO confirm in 910B_sco.sh.
          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
          # The trailing `2` and "AllReduce" are interpreted by the launcher
          # script, which is not part of this file; `2` presumably matches
          # --nnodes=2 - TODO confirm.
          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
# training_16GPU_4DP2TP2PP_MSP: runs the "training_16GPU_4DP2TP2PP_MSP" pytest
# marker through torchrun (8 processes per node, 2 nodes) on the 910B runner.
  training_16GPU_4DP2TP2PP_MSP:
    strategy:
      matrix:
        # Single-entry matrix; the value doubles as the runner label.
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          # On the 910B runner, force-remove untracked and ignored files from
          # the workspace before the checkout step runs.
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_MSP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # Single quotes keep $MASTER_ADDR, $MASTER_PORT and $RANK unexpanded
          # in this step; presumably they are resolved wherever the launcher
          # finally runs the command - TODO confirm in 910B_sco.sh.
          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
          # The trailing `2` and "AllReduce" are interpreted by the launcher
          # script, which is not part of this file; `2` presumably matches
          # --nnodes=2 - TODO confirm.
          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
# training_16GPU_4DP2TP2PP_FSP: runs the "training_16GPU_4DP2TP2PP_FSP" pytest
# marker through torchrun (8 processes per node, 2 nodes) on the 910B runner.
  training_16GPU_4DP2TP2PP_FSP:
    strategy:
      matrix:
        # Single-entry matrix; the value doubles as the runner label.
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          # On the 910B runner, force-remove untracked and ignored files from
          # the workspace before the checkout step runs.
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_FSP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # Single quotes keep $MASTER_ADDR, $MASTER_PORT and $RANK unexpanded
          # in this step; presumably they are resolved wherever the launcher
          # finally runs the command - TODO confirm in 910B_sco.sh.
          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
          # The trailing `2` and "AllReduce" are interpreted by the launcher
          # script, which is not part of this file; `2` presumably matches
          # --nnodes=2 - TODO confirm.
          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
# training_llama2: runs the "training_llama2" pytest marker through torchrun
# (8 processes, 1 node) on the 910B runner.
  training_llama2:
    strategy:
      matrix:
        # Single-entry matrix; the value doubles as the runner label.
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          # On the 910B runner, force-remove untracked and ignored files from
          # the workspace before the checkout step runs.
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      # This step carries no `if:` guard on the runner; with the single-entry
      # matrix above it always runs on 910B.
      - name: training_llama2_910B
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # The command is passed as one string to a launcher script that lives
          # outside the checkout (one directory above the working directory).
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"
# training_internlm2: runs the "training_internlm2" pytest marker through
# torchrun (8 processes, 1 node) on the 910B runner.
  training_internlm2:
    strategy:
      matrix:
        # Single-entry matrix; the value doubles as the runner label.
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          # Mask path prefixes in the job log. `path_prefix` is not defined in
          # this workflow; presumably it is set on the runner - TODO confirm.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          # On the 910B runner, force-remove untracked and ignored files from
          # the workspace before the checkout step runs.
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      # This step carries no `if:` guard on the runner; with the single-entry
      # matrix above it always runs on 910B.
      - name: training_internlm2_910B
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # The command is passed as one string to a launcher script that lives
          # outside the checkout (one directory above the working directory).
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"