Skip to content

Commit

Permalink
Merge branch 'ko3n1g/ci/unit-tests-on-slurm' into 'main'
Browse files Browse the repository at this point in the history
ci: Run unit tests on Slurm

See merge request ADLR/megatron-lm!2410
  • Loading branch information
ko3n1g committed Dec 3, 2024
2 parents 22f9a79 + 522e567 commit 9f1ef85
Show file tree
Hide file tree
Showing 27 changed files with 466 additions and 164 deletions.
8 changes: 4 additions & 4 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ workflow:
- if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 10
UNIT_TEST_TIMEOUT: 15
FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
Expand All @@ -25,7 +25,7 @@ workflow:
- if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 10
UNIT_TEST_TIMEOUT: 15
FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_REPEAT: 5
Expand All @@ -36,7 +36,7 @@ workflow:
- if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 10
UNIT_TEST_TIMEOUT: 15
FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_REPEAT: 1
Expand Down Expand Up @@ -72,7 +72,7 @@ variables:
value: '1'
description: 'Number of repetitions'
UNIT_TEST_TIMEOUT:
value: '10'
value: '30'
description: Timeout (minutes) for Unit tests (all repeats)
FUNCTIONAL_TEST:
value: 'yes'
Expand Down
212 changes: 116 additions & 96 deletions .gitlab/stages/01.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ test:build_image:
DOCKER_TLS_VERIFY: 1
DOCKER_CERT_PATH: '$DOCKER_TLS_CERTDIR/client'
TAG: purpose/builder-large
STAGE: main
STAGE: jet
script:
- apk add bash
- |
Expand Down Expand Up @@ -88,127 +88,147 @@ test:build_image:
retry:
max: 2

.unit_tests:
extends: [.test_rules, .dind_rules]
test:unit_tests_configure:
extends: [.test_rules]
needs:
- test:build_image
- test:docs_build
- test:formatting
- test:copyright
timeout: 180m
tags: [8xL40S]
variables:
GIT_STRATEGY: none
parallel:
matrix:
- BUCKET: tests/unit_tests/data/
BACKWARDS: 'true'
- BUCKET: tests/unit_tests/dist_checkpointing/
BACKWARDS: 'true'
- BUCKET: tests/unit_tests/distributed/
BACKWARDS: 'true'
- BUCKET: other
BACKWARDS: 'true'
- BUCKET: tests/unit_tests/test_inference.py tests/unit_tests/test_tokenizer.py tests/unit_tests/test_utilities.py tests/unit_tests/test_training.py
BACKWARDS: 'false'
image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
tags: [mcore-docker-node-small]
before_script:
- git rm -r tests/test_utils/local_recipes || true
- git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
- ls tests/test_utils/local_recipes
script:
- docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e BUCKET -e BACKWARDS -e TAG -e IMAGE -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
- set -x
- |
CMD=$(cat <<"RUN_TEST_EOF"
set -euxo pipefail
MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/")
if [[ "$TAG" != "latest" && $BACKWARDS == "false" ]]; then
echo "No backwards checks on $BUCKET"
exit 0
fi
cd /opt/megatron-lm$MCORE_DIR;
for i in $(seq $UNIT_TEST_REPEAT); do
SEED=$((RANDOM % 9000 + 1000));
MARKER=()
if [[ $TAG != latest ]]; then
MARKER+=("not internal")
fi
if [[ "$IMAGE" == *dev* ]]; then
MARKER+=("not flaky_in_dev")
else
MARKER+=("not flaky")
fi
MARKER_ARG=$(printf "%s" "${MARKER[0]}")
for element in "${MARKER[@]:1}"; do
MARKER_ARG+=" and $element"
done
if [[ $BUCKET == other ]]; then
BUCKETS=($(cat /opt/megatron-lm/.gitlab/stages/01.test.yml | yq '.".unit_tests".parallel.matrix | del(.[] | select(.BUCKET == "other")) | .[].BUCKET' | tr " " "\n" | sed 's/[^ ]*/--ignore &/g' | tr "\n" " "))
IGNORE_ARGS=(${BUCKETS[@]})
BUCKET=tests/unit_tests
else
IGNORE_ARGS=()
BUCKET=${BUCKET}
fi
if [[ -d $BUCKET ]]; then
timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${IGNORE_ARGS[@]}" -m "${MARKER_ARG}" $BUCKET
fi
done
RUN_TEST_EOF
)
A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
- |
  # Generate one child-pipeline definition per (environment, core tag) pair.
  # Each output file is named unit-test-job-<environment>-<tag>.yaml, which is
  # the name the trigger jobs look up via $ENVIRONMENT and $TAG.
  export PYTHONPATH=$(pwd)
  for test_env in lts dev; do
    for core_tag in legacy latest; do
      python tests/test_utils/scripts/generate_jet_trigger_job.py \
        --scope "unit-tests" \
        --environment "${test_env}" \
        --n-repeat "${UNIT_TEST_REPEAT}" \
        --time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
        --test-cases "all" \
        --a100-cluster "dgxa100_dracooci-ord" \
        --h100-cluster "dgxh100_coreweave" \
        --container-image ${UTILITY_IMAGE} \
        --container-tag ${CI_PIPELINE_ID} \
        --dependent-job "test:unit_tests_configure" \
        --tag "${core_tag}" \
        --output-path "unit-test-job-${test_env}-${core_tag}.yaml"
    done
  done
docker exec mcore_ci_${CI_PIPELINE_ID} bash -c "$CMD"
after_script:
- docker container stop mcore_ci_${CI_PIPELINE_ID} || true
artifacts:
paths:
- coverage
- unit-test-job-dev-legacy.yaml
- unit-test-job-dev-latest.yaml
- unit-test-job-lts-legacy.yaml
- unit-test-job-lts-latest.yaml
- tests/test_utils/local_recipes

# Template for jobs that trigger a child pipeline. The child pipeline definition is
# the unit-test-job-$ENVIRONMENT-$TAG.yaml artifact of test:unit_tests_configure, so
# every job extending this template has to set both ENVIRONMENT and TAG.
.unit_tests_run:
needs:
- test:formatting
- test:copyright
- test:secret_detection
- test:unit_tests_configure
extends: [.test_rules]
trigger:
include:
# File name is resolved from the extending job's ENVIRONMENT and TAG variables.
- artifact: unit-test-job-$ENVIRONMENT-$TAG.yaml
job: test:unit_tests_configure
# With `depend`, this job's status follows the status of the triggered pipeline.
strategy: depend
variables:
RO_API_TOKEN: $PAT
CONTAINER_TAG: $CI_PIPELINE_ID
CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
GITLAB_ENDPOINT: $GITLAB_ENDPOINT
PARENT_PIPELINE_ID: $CI_PIPELINE_ID
inherit:
variables: true
rules:
# NOTE(review): the next two `if` lines look like the before/after sides of one
# edited rule (they differ only in the $UNIT_TEST_REPEAT check) - confirm which
# one is meant to carry the allow_failure/when settings that follow.
- if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0'
- if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
allow_failure: true
when: on_success
- if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
when: on_success

test:pyt(LTS)_mcore(latest):
extends: [.unit_tests]
needs:
- test:pyt(LTS)_mcore(0.9.0)
- test:pyt(DEV)_mcore(0.9.0)
# dev environment, legacy core tag: triggers unit-test-job-dev-legacy.yaml.
# Only ENVIRONMENT and TAG are set; .unit_tests_run uses them to pick the artifact.
test:unit_tests_pyt(DEV)_mcore(legacy):
  extends: [.unit_tests_run]
  variables:
    ENVIRONMENT: dev
    TAG: legacy

test:pyt(LTS)_mcore(0.9.0):
extends: [.unit_tests]
# lts environment, legacy core tag: triggers unit-test-job-lts-legacy.yaml.
# ENVIRONMENT must be `lts` here; with `dev` this job would run the dev-legacy
# pipeline a second time and the lts-legacy pipeline would never be triggered.
test:unit_tests_pyt(LTS)_mcore(legacy):
  extends: [.unit_tests_run]
  variables:
    ENVIRONMENT: lts
    TAG: legacy

test:pyt(DEV)_mcore(latest):
extends: [.unit_tests]
needs:
- test:pyt(LTS)_mcore(0.9.0)
- test:pyt(DEV)_mcore(0.9.0)
# dev environment, latest core tag: triggers unit-test-job-dev-latest.yaml.
# ENVIRONMENT must be `dev` here; with `lts` this job would duplicate the
# lts-latest pipeline and the dev-latest pipeline would never be triggered.
test:unit_tests_pyt(DEV)_mcore(latest):
  extends: [.unit_tests_run]
  variables:
    ENVIRONMENT: dev
    TAG: latest

test:pyt(DEV)_mcore(0.9.0):
extends: [.unit_tests]
# lts environment, latest core tag: triggers unit-test-job-lts-latest.yaml.
# Only ENVIRONMENT and TAG are set; .unit_tests_run uses them to pick the artifact.
test:unit_tests_pyt(LTS)_mcore(latest):
  extends: [.unit_tests_run]
  variables:
    ENVIRONMENT: lts
    TAG: latest

test:notify_unit_tests:
extends: [.test_rules]
image: badouralix/curl-jq
needs:
- test:pyt(LTS)_mcore(latest)
- test:pyt(DEV)_mcore(latest)
- test:pyt(LTS)_mcore(0.9.0)
- test:pyt(DEV)_mcore(0.9.0)
- test:unit_tests_pyt(DEV)_mcore(latest)
- test:unit_tests_pyt(LTS)_mcore(latest)
tags:
- mcore-docker-node-small
script:
Expand Down
40 changes: 15 additions & 25 deletions .gitlab/stages/02.functional-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,19 @@ include:
ref: main
file: downstreams.yml

functional:build_image:
extends: [test:build_image, .functional_tests_rules]
needs:
- test:build_image
- test:docs_build
- test:formatting
- test:copyright
variables:
STAGE: jet
TAG: purpose/builder-small

functional:configure:
needs:
- functional:build_image
- job: test:pyt(LTS)_mcore(latest)
- job: test:unit_tests_pyt(DEV)_mcore(latest)
optional: true
- job: test:pyt(DEV)_mcore(latest)
- job: test:unit_tests_pyt(LTS)_mcore(latest)
optional: true
extends: [.functional_tests_rules]
image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
tags: [mcore-docker-node-small]
before_script:
- git rm -r tests/functional_tests/local_recipes || true
- git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes
- ls tests/functional_tests/local_recipes
- git rm -r tests/test_utils/local_recipes || true
- git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
- ls tests/test_utils/local_recipes
script:
- set -x
- |
Expand All @@ -60,7 +48,7 @@ functional:configure:
fi
- |
export PYTHONPATH=$(pwd)
python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
python tests/test_utils/scripts/generate_jet_trigger_job.py \
--scope $FUNCTIONAL_TEST_SCOPE \
--environment dev \
--n-repeat "$FUNCTIONAL_TEST_REPEAT" \
Expand All @@ -70,11 +58,12 @@ functional:configure:
--h100-cluster $H100_CLUSTER \
--container-image ${UTILITY_IMAGE} \
--container-tag ${CI_PIPELINE_ID} \
--output-path "jet-trigger-job-dev.yaml" \
--dependent-job "functional:configure" \
--output-path "functional-test-job-dev.yaml" \
${RELEASE_ARGS[@]}
- |
export PYTHONPATH=$(pwd)
python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
python tests/test_utils/scripts/generate_jet_trigger_job.py \
--scope $FUNCTIONAL_TEST_SCOPE \
--environment lts \
--n-repeat "$FUNCTIONAL_TEST_REPEAT" \
Expand All @@ -84,21 +73,22 @@ functional:configure:
--h100-cluster $H100_CLUSTER \
--container-image ${UTILITY_IMAGE} \
--container-tag ${CI_PIPELINE_ID} \
--output-path "jet-trigger-job-lts.yaml" \
--dependent-job "functional:configure" \
--output-path "functional-test-job-lts.yaml" \
${RELEASE_ARGS[@]}
artifacts:
paths:
- jet-trigger-job-lts.yaml
- jet-trigger-job-dev.yaml
- tests/functional_tests/local_recipes
- functional-test-job-lts.yaml
- functional-test-job-dev.yaml
- tests/test_utils/local_recipes

.run:
stage: functional_tests
needs: [functional:configure]
extends: [.functional_tests_rules]
trigger:
include:
- artifact: jet-trigger-job-$ENVIRONMENT.yaml
- artifact: functional-test-job-$ENVIRONMENT.yaml
job: functional:configure
strategy: depend
variables:
Expand Down
Loading

0 comments on commit 9f1ef85

Please sign in to comment.