diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c22b87d418..b24e9dd0b7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,7 +14,7 @@ workflow: - if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: UNIT_TEST_REPEAT: 1 - UNIT_TEST_TIMEOUT: 10 + UNIT_TEST_TIMEOUT: 15 FUNCTIONAL_TEST: 'yes' FUNCTIONAL_TEST_SCOPE: mr FUNCTIONAL_TEST_REPEAT: 5 @@ -25,7 +25,7 @@ workflow: - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: UNIT_TEST_REPEAT: 1 - UNIT_TEST_TIMEOUT: 10 + UNIT_TEST_TIMEOUT: 15 FUNCTIONAL_TEST: 'yes' FUNCTIONAL_TEST_SCOPE: nightly FUNCTIONAL_TEST_REPEAT: 5 @@ -36,7 +36,7 @@ workflow: - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: UNIT_TEST_REPEAT: 1 - UNIT_TEST_TIMEOUT: 10 + UNIT_TEST_TIMEOUT: 15 FUNCTIONAL_TEST: 'yes' FUNCTIONAL_TEST_SCOPE: weekly FUNCTIONAL_TEST_REPEAT: 1 @@ -72,7 +72,7 @@ variables: value: '1' description: 'Number of repetitions' UNIT_TEST_TIMEOUT: - value: '10' + value: '30' description: Timeout (minutes) for Unit tests (all repeats) FUNCTIONAL_TEST: value: 'yes' diff --git a/.gitlab/stages/01.test.yml b/.gitlab/stages/01.test.yml index 8512adde2b..fa9324ac4a 100644 --- a/.gitlab/stages/01.test.yml +++ b/.gitlab/stages/01.test.yml @@ -41,7 +41,7 @@ test:build_image: DOCKER_TLS_VERIFY: 1 DOCKER_CERT_PATH: '$DOCKER_TLS_CERTDIR/client' TAG: purpose/builder-large - STAGE: main + STAGE: jet script: - apk add bash - | @@ -88,127 +88,147 @@ test:build_image: retry: max: 2 -.unit_tests: - extends: [.test_rules, .dind_rules] +test:unit_tests_configure: + extends: [.test_rules] needs: - test:build_image - - test:docs_build - - test:formatting - - test:copyright - timeout: 180m - tags: [8xL40S] - variables: - GIT_STRATEGY: none - parallel: - matrix: - - BUCKET: tests/unit_tests/data/ - BACKWARDS: 'true' - - BUCKET: tests/unit_tests/dist_checkpointing/ - BACKWARDS: 'true' - - BUCKET: tests/unit_tests/distributed/ - BACKWARDS: 'true' - - BUCKET: other - BACKWARDS: 'true' - - BUCKET: tests/unit_tests/test_inference.py tests/unit_tests/test_tokenizer.py tests/unit_tests/test_utilities.py tests/unit_tests/test_training.py - BACKWARDS: 'false' + image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID} + tags: [mcore-docker-node-small] + before_script: + - git rm -r tests/test_utils/local_recipes || true + - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes + - ls tests/test_utils/local_recipes script: - - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e BUCKET -e BACKWARDS -e TAG -e IMAGE -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))" + - set -x - | - CMD=$(cat <<"RUN_TEST_EOF" - set -euxo pipefail - - MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/") - - if [[ "$TAG" != "latest" && $BACKWARDS == "false" ]]; then - echo "No backwards checks on $BUCKET" - exit 0 - fi - - cd /opt/megatron-lm$MCORE_DIR; - - for i in $(seq $UNIT_TEST_REPEAT); do - SEED=$((RANDOM % 9000 + 1000)); - MARKER=() - if [[ $TAG != latest ]]; then - MARKER+=("not internal") - fi - if [[ "$IMAGE" == *dev* ]]; then - MARKER+=("not flaky_in_dev") - else - MARKER+=("not flaky") - fi - MARKER_ARG=$(printf "%s" "${MARKER[0]}") - for element in "${MARKER[@]:1}"; do - MARKER_ARG+=" and $element" - done - - 
if [[ $BUCKET == other ]]; then
-          BUCKETS=($(cat /opt/megatron-lm/.gitlab/stages/01.test.yml | yq '.".unit_tests".parallel.matrix | del(.[] | select(.BUCKET == "other")) | .[].BUCKET' | tr " " "\n" | sed 's/[^ ]*/--ignore &/g' | tr "\n" " "))
-          IGNORE_ARGS=(${BUCKETS[@]})
-          BUCKET=tests/unit_tests
-        else
-          IGNORE_ARGS=()
-          BUCKET=${BUCKET}
-        fi
-
-        if [[ -d $BUCKET ]]; then
-          timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${IGNORE_ARGS[@]}" -m "${MARKER_ARG}" $BUCKET
-        fi
-      done
-      RUN_TEST_EOF
-      )
+      A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
+      H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
+    - |
+      export PYTHONPATH=$(pwd)
+      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+        --scope "unit-tests" \
+        --environment lts \
+        --n-repeat "${UNIT_TEST_REPEAT}" \
+        --time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
+        --test-cases "all" \
+        --a100-cluster $A100_CLUSTER \
+        --h100-cluster $H100_CLUSTER \
+        --container-image ${UTILITY_IMAGE} \
+        --container-tag ${CI_PIPELINE_ID} \
+        --dependent-job "test:unit_tests_configure" \
+        --tag "legacy" \
+        --output-path "unit-test-job-lts-legacy.yaml"
+    - |
+      export PYTHONPATH=$(pwd)
+      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+        --scope "unit-tests" \
+        --environment lts \
+        --n-repeat "${UNIT_TEST_REPEAT}" \
+        --time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
+        --test-cases "all" \
+        --a100-cluster $A100_CLUSTER \
+        --h100-cluster $H100_CLUSTER \
+        --container-image ${UTILITY_IMAGE} \
+        --container-tag ${CI_PIPELINE_ID} \
+        --dependent-job "test:unit_tests_configure" \
+        --tag "latest" \
+        --output-path "unit-test-job-lts-latest.yaml"
+    - |
+      export PYTHONPATH=$(pwd)
+      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+        --scope "unit-tests" \
+        --environment dev \
+        --n-repeat "${UNIT_TEST_REPEAT}" \
+        --time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
+        --test-cases "all" \
+        --a100-cluster $A100_CLUSTER \
+        --h100-cluster $H100_CLUSTER \
+        --container-image ${UTILITY_IMAGE} \
+        --container-tag ${CI_PIPELINE_ID} \
+        --dependent-job "test:unit_tests_configure" \
+        --tag "legacy" \
+        --output-path "unit-test-job-dev-legacy.yaml"
+    - |
+      export PYTHONPATH=$(pwd)
+      python tests/test_utils/scripts/generate_jet_trigger_job.py \
+        --scope "unit-tests" \
+        --environment dev \
+        --n-repeat "${UNIT_TEST_REPEAT}" \
+        --time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
+        --test-cases "all" \
+        --a100-cluster $A100_CLUSTER \
+        --h100-cluster $H100_CLUSTER \
+        --container-image ${UTILITY_IMAGE} \
+        --container-tag ${CI_PIPELINE_ID} \
+        --dependent-job "test:unit_tests_configure" \
+        --tag "latest" \
+        --output-path "unit-test-job-dev-latest.yaml"
-    - docker exec mcore_ci_${CI_PIPELINE_ID} bash -c "$CMD"
-  after_script:
-    - docker container stop mcore_ci_${CI_PIPELINE_ID} || true
   artifacts:
     paths:
-      - coverage
+      - unit-test-job-dev-legacy.yaml
+      - unit-test-job-dev-latest.yaml
+      - unit-test-job-lts-legacy.yaml
+      - unit-test-job-lts-latest.yaml
+      - tests/test_utils/local_recipes
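Note: the four `generate_jet_trigger_job.py` invocations above differ only in the `(environment, tag)` pair and the derived output filename. For local debugging, the same four artifacts can be reproduced outside CI; a minimal sketch, assuming a repo-root checkout with the CI utility image's Python dependencies (click, jetclient, pyyaml) installed, and using placeholder cluster and image names:

```python
# Hypothetical local helper: regenerate the four unit-test child-pipeline
# definitions that test:unit_tests_configure produces in CI.
# Cluster names, image name, and tag below are placeholders, not CI values.
import itertools
import subprocess

UNIT_TEST_REPEAT = 1    # CI default per .gitlab-ci.yml
UNIT_TEST_TIMEOUT = 30  # minutes, CI default per .gitlab-ci.yml

for environment, tag in itertools.product(("lts", "dev"), ("legacy", "latest")):
    subprocess.run(
        [
            "python", "tests/test_utils/scripts/generate_jet_trigger_job.py",
            "--scope", "unit-tests",
            "--environment", environment,
            "--n-repeat", str(UNIT_TEST_REPEAT),
            "--time-limit", str(UNIT_TEST_TIMEOUT * 60),
            "--test-cases", "all",
            "--a100-cluster", "my_a100_cluster",      # placeholder
            "--h100-cluster", "my_h100_cluster",      # placeholder
            "--container-image", "mcore_utility",     # placeholder
            "--container-tag", "local",
            "--dependent-job", "test:unit_tests_configure",
            "--tag", tag,
            "--output-path", f"unit-test-job-{environment}-{tag}.yaml",
        ],
        check=True,
    )
```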
+
+.unit_tests_run:
+  needs:
+    - test:formatting
+    - test:copyright
+    - test:secret_detection
+    - test:unit_tests_configure
+  extends: [.test_rules]
+  trigger:
+    include:
+      - artifact: unit-test-job-$ENVIRONMENT-$TAG.yaml
+        job: test:unit_tests_configure
+    strategy: depend
+  variables:
+    RO_API_TOKEN: $PAT
+    CONTAINER_TAG: $CI_PIPELINE_ID
+    CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
+    GITLAB_ENDPOINT: $GITLAB_ENDPOINT
+    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
+  inherit:
+    variables: true
   rules:
-    - if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0'
+    - if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
       allow_failure: true
       when: on_success
     - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
       when: on_success
 
-test:pyt(LTS)_mcore(latest):
-  extends: [.unit_tests]
-  needs:
-    - test:pyt(LTS)_mcore(0.9.0)
-    - test:pyt(DEV)_mcore(0.9.0)
+test:unit_tests_pyt(DEV)_mcore(legacy):
+  extends: [.unit_tests_run]
   variables:
-    TAG: latest
-    IMAGE: ${CI_MCORE_LTS_IMAGE}
+    ENVIRONMENT: dev
+    TAG: legacy
 
-test:pyt(LTS)_mcore(0.9.0):
-  extends: [.unit_tests]
+test:unit_tests_pyt(LTS)_mcore(legacy):
+  extends: [.unit_tests_run]
   variables:
-    TAG: core_r0.9.0
-    IMAGE: ${CI_MCORE_LTS_IMAGE}
+    ENVIRONMENT: lts
+    TAG: legacy
 
-test:pyt(DEV)_mcore(latest):
-  extends: [.unit_tests]
-  needs:
-    - test:pyt(LTS)_mcore(0.9.0)
-    - test:pyt(DEV)_mcore(0.9.0)
+test:unit_tests_pyt(DEV)_mcore(latest):
+  extends: [.unit_tests_run]
   variables:
+    ENVIRONMENT: dev
     TAG: latest
-    IMAGE: ${CI_MCORE_DEV_IMAGE}
 
-test:pyt(DEV)_mcore(0.9.0):
-  extends: [.unit_tests]
+test:unit_tests_pyt(LTS)_mcore(latest):
+  extends: [.unit_tests_run]
   variables:
-    TAG: core_r0.9.0
-    IMAGE: ${CI_MCORE_DEV_IMAGE}
+    ENVIRONMENT: lts
+    TAG: latest
 
 test:notify_unit_tests:
   extends: [.test_rules]
   image: badouralix/curl-jq
   needs:
-    - test:pyt(LTS)_mcore(latest)
-    - test:pyt(DEV)_mcore(latest)
-    - test:pyt(LTS)_mcore(0.9.0)
-    - test:pyt(DEV)_mcore(0.9.0)
+    - test:unit_tests_pyt(DEV)_mcore(latest)
+    - test:unit_tests_pyt(LTS)_mcore(latest)
   tags:
     - mcore-docker-node-small
   script:
diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml
index 7a0e4d6722..da31199216 100644
--- a/.gitlab/stages/02.functional-tests.yml
+++ b/.gitlab/stages/02.functional-tests.yml
@@ -16,31 +16,19 @@ include:
       ref: main
       file: downstreams.yml
 
-functional:build_image:
-  extends: [test:build_image, .functional_tests_rules]
-  needs:
-    - test:build_image
-    - test:docs_build
-    - test:formatting
-    - test:copyright
-  variables:
-    STAGE: jet
-    TAG: purpose/builder-small
-
 functional:configure:
   needs:
-    - functional:build_image
-    - job: test:pyt(LTS)_mcore(latest)
+    - job: test:unit_tests_pyt(DEV)_mcore(latest)
       optional: true
-    - job: test:pyt(DEV)_mcore(latest)
+    - job: test:unit_tests_pyt(LTS)_mcore(latest)
       optional: true
   extends: [.functional_tests_rules]
   image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
   tags: [mcore-docker-node-small]
   before_script:
-    - git rm -r tests/functional_tests/local_recipes || true
-    - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes
-    - ls tests/functional_tests/local_recipes
+    - git rm -r tests/test_utils/local_recipes || true
+    - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
+    - ls tests/test_utils/local_recipes
   script:
     - set -x
     - |
@@ -60,7 +48,7 @@ functional:configure:
       fi
     - |
       export PYTHONPATH=$(pwd)
-      python 
tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ + python tests/test_utils/scripts/generate_jet_trigger_job.py \ --scope $FUNCTIONAL_TEST_SCOPE \ --environment dev \ --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ @@ -70,11 +58,12 @@ functional:configure: --h100-cluster $H100_CLUSTER \ --container-image ${UTILITY_IMAGE} \ --container-tag ${CI_PIPELINE_ID} \ - --output-path "jet-trigger-job-dev.yaml" \ + --dependent-job "functional:configure" \ + --output-path "functional-test-job-dev.yaml" \ ${RELEASE_ARGS[@]} - | export PYTHONPATH=$(pwd) - python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ + python tests/test_utils/scripts/generate_jet_trigger_job.py \ --scope $FUNCTIONAL_TEST_SCOPE \ --environment lts \ --n-repeat "$FUNCTIONAL_TEST_REPEAT" \ @@ -84,13 +73,14 @@ functional:configure: --h100-cluster $H100_CLUSTER \ --container-image ${UTILITY_IMAGE} \ --container-tag ${CI_PIPELINE_ID} \ - --output-path "jet-trigger-job-lts.yaml" \ + --dependent-job "functional:configure" \ + --output-path "functional-test-job-lts.yaml" \ ${RELEASE_ARGS[@]} artifacts: paths: - - jet-trigger-job-lts.yaml - - jet-trigger-job-dev.yaml - - tests/functional_tests/local_recipes + - functional-test-job-lts.yaml + - functional-test-job-dev.yaml + - tests/test_utils/local_recipes .run: stage: functional_tests @@ -98,7 +88,7 @@ functional:configure: extends: [.functional_tests_rules] trigger: include: - - artifact: jet-trigger-job-$ENVIRONMENT.yaml + - artifact: functional-test-job-$ENVIRONMENT.yaml job: functional:configure strategy: depend variables: diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev index e6073c1713..80a4e04c4f 100644 --- a/Dockerfile.ci.dev +++ b/Dockerfile.ci.dev @@ -27,9 +27,17 @@ COPY --from=build_causal_conv1d /opt/causal_conv1d-*.whl ./ COPY --from=build_grouped_gemm /opt/grouped_gemm-*.whl ./ COPY --from=build_mamba_ssm /opt/mamba_ssm-*.whl ./ -RUN pip install causal_conv1d-*.whl \ - mamba_ssm-*.whl \ - grouped_gemm-*.whl +RUN \ + --mount=type=bind,source=requirements,target=requirements \ + --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ + --mount=type=bind,source=setup.py,target=setup.py \ + --mount=type=bind,source=megatron/core/package_info.py,target=megatron/core/package_info.py \ + --mount=type=bind,source=megatron/core/README.md,target=megatron/core/README.md \ + --mount=type=bind,source=megatron/core/__init__.py,target=megatron/core/__init__.py <<"EOF" bash -ex + +pip install causal_conv1d-*.whl mamba_ssm-*.whl grouped_gemm-*.whl +PY_ENV=pytorch:24.07 pip install . 
+EOF
 
 # Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker
 ARG MCORE_REPO
@@ -47,7 +55,7 @@ git checkout $MCORE_REF
 
 # Checkout backwards-ref
 cd /opt
-rm -rf /opt/megatron-lm-$MCORE_BACKWARDS_REF; mkdir megatron-lm-$MCORE_BACKWARDS_REF; cd megatron-lm-$MCORE_BACKWARDS_REF
+rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
 git init
 git remote add origin ${MCORE_REPO}
 git fetch origin $MCORE_BACKWARDS_REF
diff --git a/Dockerfile.ci.lts b/Dockerfile.ci.lts
index af4698dae5..ea0cf31a0b 100644
--- a/Dockerfile.ci.lts
+++ b/Dockerfile.ci.lts
@@ -28,9 +28,17 @@ COPY --from=build_causal_conv1d /opt/causal_conv1d-*.whl ./
 COPY --from=build_grouped_gemm /opt/grouped_gemm-*.whl ./
 COPY --from=build_mamba_ssm /opt/mamba_ssm-*.whl ./
 
-RUN pip install causal_conv1d-*.whl \
-    mamba_ssm-*.whl \
-    grouped_gemm-*.whl
+RUN \
+  --mount=type=bind,source=requirements,target=requirements \
+  --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+  --mount=type=bind,source=setup.py,target=setup.py \
+  --mount=type=bind,source=megatron/core/package_info.py,target=megatron/core/package_info.py \
+  --mount=type=bind,source=megatron/core/README.md,target=megatron/core/README.md \
+  --mount=type=bind,source=megatron/core/__init__.py,target=megatron/core/__init__.py <<"EOF" bash -ex
+
+pip install causal_conv1d-*.whl mamba_ssm-*.whl grouped_gemm-*.whl
+PY_ENV=pytorch:24.01 pip install .
+EOF
 
 # Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker
 ARG MCORE_REPO
@@ -48,7 +56,7 @@ git checkout $MCORE_REF
 
 # Checkout backwards-ref
 cd /opt
-rm -rf /opt/megatron-lm-$MCORE_BACKWARDS_REF; mkdir megatron-lm-$MCORE_BACKWARDS_REF; cd megatron-lm-$MCORE_BACKWARDS_REF
+rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
 git init
 git remote add origin ${MCORE_REPO}
 git fetch origin $MCORE_BACKWARDS_REF
@@ -56,10 +64,7 @@ git checkout $MCORE_BACKWARDS_REF
 rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
 EOF
 
-RUN PY_ENV=pytorch:24.01 \
-    CAUSAL_CONV1D_FORCE_BUILD=TRUE \
-    MAMBA_FORCE_BUILD=TRUE \
-    pip install --no-build-isolation -e /opt/megatron-lm
+RUN PY_ENV=pytorch:24.01 pip install -e /opt/megatron-lm
 
 ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH"
 
 ##### For NVIDIANS only #####
diff --git a/tests/functional_tests/python_test_utils/common.py b/tests/functional_tests/python_test_utils/common.py
index 32bb200ee6..1b21fa81d5 100644
--- a/tests/functional_tests/python_test_utils/common.py
+++ b/tests/functional_tests/python_test_utils/common.py
@@ -84,6 +84,9 @@ def read_tb_logs_as_list(path, index=0):
 
 def load_expected_data():
     expected_metrics_file = os.getenv("EXPECTED_METRICS_FILE")
 
+    if expected_metrics_file is None:
+        raise ValueError("EXPECTED_METRICS_FILE environment variable is not set")
+
     if os.path.exists(expected_metrics_file):
         with open(expected_metrics_file) as f:
diff --git a/tests/functional_tests/jet_recipes/_build-mcore-dev.yaml b/tests/test_utils/recipes/_build-mcore-dev.yaml
similarity index 100%
rename from tests/functional_tests/jet_recipes/_build-mcore-dev.yaml
rename to tests/test_utils/recipes/_build-mcore-dev.yaml
diff --git a/tests/functional_tests/jet_recipes/_build-mcore-lts.yaml b/tests/test_utils/recipes/_build-mcore-lts.yaml
similarity index 100%
rename from tests/functional_tests/jet_recipes/_build-mcore-lts.yaml
rename 
to tests/test_utils/recipes/_build-mcore-lts.yaml diff --git a/tests/functional_tests/jet_recipes/_build-nemo.yaml b/tests/test_utils/recipes/_build-nemo.yaml similarity index 100% rename from tests/functional_tests/jet_recipes/_build-nemo.yaml rename to tests/test_utils/recipes/_build-nemo.yaml diff --git a/tests/functional_tests/jet_recipes/bert.yaml b/tests/test_utils/recipes/bert.yaml similarity index 100% rename from tests/functional_tests/jet_recipes/bert.yaml rename to tests/test_utils/recipes/bert.yaml diff --git a/tests/functional_tests/jet_recipes/gpt-modelopt.yaml b/tests/test_utils/recipes/gpt-modelopt.yaml similarity index 100% rename from tests/functional_tests/jet_recipes/gpt-modelopt.yaml rename to tests/test_utils/recipes/gpt-modelopt.yaml diff --git a/tests/functional_tests/jet_recipes/gpt-nemo.yaml b/tests/test_utils/recipes/gpt-nemo.yaml similarity index 100% rename from tests/functional_tests/jet_recipes/gpt-nemo.yaml rename to tests/test_utils/recipes/gpt-nemo.yaml diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml similarity index 100% rename from tests/functional_tests/jet_recipes/gpt.yaml rename to tests/test_utils/recipes/gpt.yaml diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/test_utils/recipes/multimodal-llava.yaml similarity index 100% rename from tests/functional_tests/jet_recipes/multimodal-llava.yaml rename to tests/test_utils/recipes/multimodal-llava.yaml diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/test_utils/recipes/t5.yaml similarity index 100% rename from tests/functional_tests/jet_recipes/t5.yaml rename to tests/test_utils/recipes/t5.yaml diff --git a/tests/test_utils/recipes/unit-tests.yaml b/tests/test_utils/recipes/unit-tests.yaml new file mode 100644 index 0000000000..cda58d92ea --- /dev/null +++ b/tests/test_utils/recipes/unit-tests.yaml @@ -0,0 +1,80 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: '{test_case}' + model: unit-tests + nodes: 1 + build: mcore-pyt-{environment} + gpus: 8 + platforms: dgx_h100 + script: |- + ls + + export TAG={tag} + export ENVIRONMENT={environment} + export BUCKET="{test_case}" + export UNIT_TEST_REPEAT={n_repeat} + export UNIT_TEST_TIMEOUT=10 + + set -euxo pipefail + + if [[ "$TAG" == "latest" ]]; then + TEST_PATH="/opt/megatron-lm" + else + TEST_PATH="/opt/megatron-lm-legacy/" + fi + + cd $TEST_PATH + + MARKER=() + if [[ "$TAG" == "legacy" ]]; then + MARKER+=("not internal") + fi + + if [[ "$ENVIRONMENT" == "lts" ]]; then + MARKER+=("not flaky") + fi + + if [[ "$ENVIRONMENT" == "dev" ]]; then + MARKER+=("not flaky_in_dev") + fi + + MARKER_ARG=$(printf "%s" "${{MARKER[0]}}") + for element in "${{MARKER[@]:1}}"; do + MARKER_ARG+=" and $element" + done + + IGNORE_TEST_CASES=$(cat /opt/megatron-lm/tests/test_utils/recipes/unit-tests.yaml | yq eval 'with(.products[].test_case; del(.[] | select(. 
== env(BUCKET)))) | .products[].test_case[]' | tr " " "\n")
+      IGNORE_ARGS=()
+      while IFS= read -r test_case; do
+        if [[ $test_case == *\** ]]; then
+          FILES=($(ls $test_case))
+          echo ${{FILES[@]}}
+          for file in "${{FILES[@]}}"; do
+            IGNORE_ARGS+=("--ignore='$file'")
+          done
+        else
+          IGNORE_ARGS+=("--ignore=$test_case")
+        fi
+      done <<< "$IGNORE_TEST_CASES"
+
+      for i in $(seq $UNIT_TEST_REPEAT); do
+        CMD=$(echo timeout ${{UNIT_TEST_TIMEOUT}}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail ${{IGNORE_ARGS[@]}} -m "'${{MARKER_ARG}}'" $BUCKET)
+        eval "$CMD"
+      done
+
+products:
+  - environment: [lts, dev]
+    tag: [latest, legacy]
+    scope: [unit-tests]
+    n_repeat: [1]
+    time_limit: [1800]
+    test_case:
+      - tests/unit_tests/data/
+      - tests/unit_tests/dist_checkpointing/*.py
+      - tests/unit_tests/dist_checkpointing/models/
+      - tests/unit_tests/transformer/*.py
+      - tests/unit_tests/transformer/moe
+      - tests/unit_tests
diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/test_utils/scripts/common.py
similarity index 90%
rename from tests/functional_tests/python_test_utils/jet/common.py
rename to tests/test_utils/scripts/common.py
index d11d147866..dd2e2e4706 100644
--- a/tests/functional_tests/python_test_utils/jet/common.py
+++ b/tests/test_utils/scripts/common.py
@@ -149,6 +149,23 @@ def filter_by_model(
     return workload_manifests
 
 
+def filter_by_tag(
+    workload_manifests: List[jetclient.JETWorkloadManifest], tag: str
+) -> List[jetclient.JETWorkloadManifest]:
+    """Return all workloads with a matching tag."""
+    workload_manifests = list(
+        workload_manifest
+        for workload_manifest in workload_manifests
+        if hasattr(workload_manifest.spec, "tag") and workload_manifest.spec.tag == tag
+    )
+
+    if len(workload_manifests) == 0:
+        print("No test_case found!")
+        return []
+
+    return workload_manifests
+
+
 def filter_by_test_cases(
     workload_manifests: List[jetclient.JETWorkloadManifest], test_cases: str
 ) -> List[jetclient.JETWorkloadManifest]:
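Note: `filter_by_tag` consults only `spec.tag`, so recipes that carry no `tag` field (the functional-test recipes) are dropped whenever a tag is requested, and only the tagged unit-test products survive. A minimal sketch of that behavior, using stand-in objects rather than real `jetclient.JETWorkloadManifest` instances:

```python
# Illustration of the tag filter above; SimpleNamespace stands in for
# jetclient manifests, since only .spec.tag is consulted.
from types import SimpleNamespace

workloads = [
    SimpleNamespace(spec=SimpleNamespace(tag="latest", name="unit-tests-latest")),
    SimpleNamespace(spec=SimpleNamespace(tag="legacy", name="unit-tests-legacy")),
    SimpleNamespace(spec=SimpleNamespace(name="gpt")),  # functional recipe, no tag field
]

# Same predicate as filter_by_tag(workloads, "latest").
selected = [
    w for w in workloads if hasattr(w.spec, "tag") and w.spec.tag == "latest"
]
print([w.spec.name for w in selected])  # ['unit-tests-latest']
```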
/ "local_recipes" workloads: List[jetclient.JETWorkloadManifest] = [] build_workloads: List[jetclient.JETClient] = [] @@ -198,6 +216,9 @@ def load_workloads( if workloads and model: workloads = filter_by_model(workload_manifests=workloads, model=model) + if workloads and tag: + workloads = filter_by_tag(workload_manifests=workloads, tag=tag) + if workloads and test_cases != "all": workloads = filter_by_test_cases(workload_manifests=workloads, test_cases=test_cases) diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/test_utils/scripts/generate_jet_trigger_job.py similarity index 86% rename from tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py rename to tests/test_utils/scripts/generate_jet_trigger_job.py index cb1fecb3de..ee41cc99be 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/test_utils/scripts/generate_jet_trigger_job.py @@ -4,7 +4,7 @@ import click import yaml -from tests.functional_tests.python_test_utils.jet import common +from tests.test_utils.scripts import common BASE_PATH = pathlib.Path(__file__).parent.resolve() @@ -20,8 +20,15 @@ @click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") @click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") @click.option("--output-path", required=True, type=str, help="Path to write GitLab job to") -@click.option("--container-image", required=True, type=str, help="LTS Container tag to use") +@click.option("--container-image", required=True, type=str, help="LTS Container image to use") @click.option("--container-tag", required=True, type=str, help="Container tag to use") +@click.option( + "--dependent-job", + required=True, + type=str, + help="Name of job that created the downstream pipeline", +) +@click.option("--tag", required=False, type=str, help="Tag (only relevant for unit tests)") @click.option( "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" ) @@ -42,13 +49,19 @@ def main( output_path: str, container_image: str, container_tag: str, + dependent_job: str, + tag: Optional[str] = None, run_name: Optional[str] = None, wandb_experiment: Optional[str] = None, ): list_of_test_cases = [ test_case for test_case in common.load_workloads( - scope=scope, container_tag=container_tag, environment=environment, test_cases=test_cases + scope=scope, + container_tag=container_tag, + environment=environment, + test_cases=test_cases, + tag=tag, ) if test_case.type != "build" ] @@ -103,16 +116,19 @@ def main( script = [ "export PYTHONPATH=$(pwd); " - "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", + "python tests/test_utils/scripts/launch_jet_workload.py", f"--model {test_case.spec.model}", f"--environment {test_case.spec.environment}", f"--n-repeat {n_repeat}", f"--time-limit {time_limit}", - f"--test-case {test_case.spec.test_case}", + f"--test-case '{test_case.spec.test_case}'", f"--container-tag {container_tag}", f"--cluster {cluster}", ] + if tag is not None: + script.append(f"--tag {tag}") + if run_name is not None and wandb_experiment is not None: script.append(f"--run-name {run_name}") test_case.spec.model @@ -129,7 +145,7 @@ def main( {"if": '$CI_MERGE_REQUEST_ID'}, ], "timeout": "7 days", - "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "functional:configure"}], + "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": dependent_job}], "script": [" ".join(script)], "artifacts": {"paths": 
["results/"], "when": "always"}, } diff --git a/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py b/tests/test_utils/scripts/generate_local_jobs.py similarity index 96% rename from tests/functional_tests/python_test_utils/jet/generate_local_jobs.py rename to tests/test_utils/scripts/generate_local_jobs.py index 4a40bd8ab6..ebb3e5b5f9 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py +++ b/tests/test_utils/scripts/generate_local_jobs.py @@ -12,7 +12,7 @@ import jetclient import yaml -from tests.functional_tests.python_test_utils.jet import common +from tests.test_utils.scripts import common def load_script(config_path: str) -> str: diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/test_utils/scripts/launch_jet_workload.py similarity index 88% rename from tests/functional_tests/python_test_utils/jet/launch_jet_workload.py rename to tests/test_utils/scripts/launch_jet_workload.py index 03ef71ced0..5663d3ef0f 100644 --- a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py +++ b/tests/test_utils/scripts/launch_jet_workload.py @@ -16,7 +16,7 @@ from jetclient.facades.objects import log as jet_log from jetclient.services.dtos.pipeline import PipelineStatus -from tests.functional_tests.python_test_utils.jet import common +from tests.test_utils.scripts import common BASE_PATH = pathlib.Path(__file__).parent.resolve() @@ -41,6 +41,7 @@ def launch_and_wait_for_completion( container_tag: str, cluster: str, account: str, + tag: Optional[str], run_name: Optional[str], wandb_experiment: Optional[str], ) -> jetclient.JETPipeline: @@ -54,6 +55,7 @@ def launch_and_wait_for_completion( test_case=test_case, n_repeat=n_repeat, time_limit=time_limit, + tag=tag, container_image=container_image, container_tag=container_tag, environment=environment, @@ -94,7 +96,7 @@ def launch_and_wait_for_completion( n_wait_attempts = 0 while n_wait_attempts < 3: try: - pipeline.wait(max_wait_time=60 * 60 * 24 * 7, interval=60 * 3) + pipeline.wait(max_wait_time=60 * 60 * 24 * 7, interval=60 * 1) break except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as e: print(e) @@ -169,6 +171,7 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]: @click.option("--cluster", required=True, type=str, help="Cluster to run on") @click.option("--container-tag", required=True, type=str, help="Base image of Mcore image") @click.option("--container-image", required=False, type=str, help="Base image of Mcore image") +@click.option("--tag", required=False, type=str, help="Tag (only relevant for unit tests)") @click.option( "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" ) @@ -187,22 +190,25 @@ def main( account: str, cluster: str, container_tag: str, + tag: Optional[str] = None, container_image: Optional[str] = None, run_name: Optional[str] = None, wandb_experiment: Optional[str] = None, ): + model_config_path = pathlib.Path( + BASE_PATH / ".." / ".." / "test_cases" / model / test_case / "model_config.yaml" + ) - with open( - pathlib.Path( - BASE_PATH / ".." / ".." 
/ "test_cases" / model / test_case / "model_config.yaml" - ) - ) as stream: - try: - test_case_dict = yaml.safe_load(stream) - except yaml.YAMLError as exc: - print(exc) + if model_config_path.exists(): + with open(model_config_path) as stream: + try: + test_case_dict = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) - test_type = test_case_dict['TEST_TYPE'] + test_type = test_case_dict['TEST_TYPE'] + else: + test_type = "unit_test" if test_type == "release" and (run_name is None or wandb_experiment is None): print(f"Not all arguments provided ({run_name=}, {wandb_experiment=})") @@ -221,6 +227,7 @@ def main( container_tag=container_tag, cluster=cluster, account=account, + tag=tag, run_name=run_name, wandb_experiment=wandb_experiment, ) @@ -242,9 +249,19 @@ def main( concat_logs = "\n".join(logs) print(f"Logs:\n{concat_logs}") - if test_type != "release": - success = pipeline.get_status() == PipelineStatus.SUCCESS + success = pipeline.get_status() == PipelineStatus.SUCCESS + + if test_type == "unit_test": + success = success and ( + ( + re.search(r'=.*?\bpassed\b.*?=', concat_logs) + and not re.search(r'=.*?\bfailed\b.*?=', concat_logs) + ) + or "0 selected" in concat_logs + ) + sys.exit(int(not success)) # invert for exit 0 + if test_type != "release": if success: sys.exit(int(not success)) # invert for exit 0 diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py index 8fb1c3f99a..f166a8179d 100644 --- a/tests/unit_tests/conftest.py +++ b/tests/unit_tests/conftest.py @@ -1,18 +1,27 @@ -import gc import os -import sys from pathlib import Path -from unittest import mock import pytest import torch +import torch.distributed -from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy from megatron.core.utils import is_te_min_version from tests.unit_tests.dist_checkpointing import TempNamedDir from tests.unit_tests.test_utilities import Utils +def pytest_sessionfinish(session, exitstatus): + if exitstatus == 5: + session.exitstatus = 0 + + +@pytest.fixture(scope="session", autouse=True) +def cleanup(): + yield + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + @pytest.fixture(scope="function", autouse=True) def set_env(): if is_te_min_version("1.3"): diff --git a/tests/unit_tests/dist_checkpointing/conftest.py b/tests/unit_tests/dist_checkpointing/conftest.py index 83cbc684fd..3702ac5edf 100644 --- a/tests/unit_tests/dist_checkpointing/conftest.py +++ b/tests/unit_tests/dist_checkpointing/conftest.py @@ -5,6 +5,11 @@ from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy +def pytest_sessionfinish(session, exitstatus): + if exitstatus == 5: + session.exitstatus = 0 + + @pytest.fixture(scope='session', autouse=True) def set_default_dist_ckpt_strategy(): def get_pyt_dist_save_sharded_strategy(): diff --git a/tests/unit_tests/distributed/test_param_and_grad_buffer.py b/tests/unit_tests/distributed/test_param_and_grad_buffer.py index e72304dfe5..5ff2a682a0 100644 --- a/tests/unit_tests/distributed/test_param_and_grad_buffer.py +++ b/tests/unit_tests/distributed/test_param_and_grad_buffer.py @@ -156,6 +156,7 @@ def _pad_param_if_needed(numel_unpadded): @pytest.mark.parametrize("use_distributed_optimizer", [False, True]) @pytest.mark.parametrize("overlap_grad_reduce", [False, True]) +@pytest.mark.flaky def test_grad_sync(use_distributed_optimizer: bool, overlap_grad_reduce: bool): Utils.initialize_model_parallel() diff --git 
a/tests/unit_tests/test_inference.py b/tests/unit_tests/test_inference.py index 2124826c56..bf70bf298f 100644 --- a/tests/unit_tests/test_inference.py +++ b/tests/unit_tests/test_inference.py @@ -53,6 +53,8 @@ def client(app): @unittest.mock.patch('megatron.inference.text_generation.communication.mpu') @unittest.mock.patch('megatron.inference.text_generation.generation.ForwardStep') @unittest.mock.patch('megatron.inference.text_generation.tokenization.get_tokenizer') +@pytest.mark.flaky +@pytest.mark.flaky_in_dev def test_completions( mock_get_tokenizer1, mock_forward_step, diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py index bb834a9661..96afe46e9a 100644 --- a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py @@ -7,6 +7,12 @@ from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestContainer +def test_placeholder(): + """This is here because otherwise there's no other test in this module (all disabled) and pytest would fail.""" + pass + + +@pytest.mark.flaky class TestAlltoAllDispatcher: def setup_method(self, method): pass @@ -18,6 +24,8 @@ def teardown_method(self, method): @pytest.mark.internal @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)]) + @pytest.mark.flaky + @pytest.mark.flaky_in_dev def test_forward_backward(self, tp_size, ep_size): container = MoEModelTestContainer( tp_size=tp_size, @@ -34,6 +42,8 @@ def test_forward_backward(self, tp_size, ep_size): @pytest.mark.internal @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)]) + @pytest.mark.flaky + @pytest.mark.flaky_in_dev def test_a2aseq_forward_backward(self, tp_size, ep_size): container = MoEModelTestContainer( tp_size=tp_size, @@ -50,6 +60,8 @@ def test_a2aseq_forward_backward(self, tp_size, ep_size): @pytest.mark.internal @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)]) + @pytest.mark.flaky + @pytest.mark.flaky_in_dev def test_capacity_forward_backward(self, tp_size, ep_size): container = MoEModelTestContainer( tp_size=tp_size, @@ -69,6 +81,8 @@ def test_capacity_forward_backward(self, tp_size, ep_size): @pytest.mark.internal @pytest.mark.timeout(120) @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)]) + @pytest.mark.flaky + @pytest.mark.flaky_in_dev def test_capacity_padding_forward_backward(self, tp_size, ep_size): container = MoEModelTestContainer( tp_size=tp_size, diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py index 6bf79bbe7e..895cb291aa 100644 --- a/tests/unit_tests/transformer/moe/test_token_dispatcher.py +++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py @@ -231,6 +231,8 @@ def teardown_method(self, method): @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @pytest.mark.internal @pytest.mark.parametrize("tp_size,ep_size", [(8, 1), (1, 8), (2, 4), (1, 1)]) + @pytest.mark.flaky + @pytest.mark.flaky_in_dev def test_forward_backward(self, tp_size, ep_size): container = MoEModelTestContainer( tp_size=tp_size, @@ -250,6 +252,8 @@ def test_forward_backward(self, tp_size, ep_size): @pytest.mark.parametrize( "tp_size,ep_size,moe_tp_size", [(1, 1, 8), (1, 2, 4), (1, 4, 2), (2, 2, 4)] ) + @pytest.mark.flaky + 
@pytest.mark.flaky_in_dev def test_moe_tp_forward_backward(self, tp_size, ep_size, moe_tp_size): container = MoEModelTestContainer( tp_size=tp_size, diff --git a/unit-test-job-lts.yaml b/unit-test-job-lts.yaml new file mode 100644 index 0000000000..fd6eb71dfe --- /dev/null +++ b/unit-test-job-lts.yaml @@ -0,0 +1,107 @@ +default: + interruptible: true +other: + artifacts: + paths: + - results/ + when: always + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570 + needs: + - job: functional:configure + pipeline: $PARENT_PIPELINE_ID + rules: + - if: $CI_PIPELINE_SOURCE == "parent_pipeline" + - if: $CI_MERGE_REQUEST_ID + script: + - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py + --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case + other --container-tag 20283570 --cluster dgxh100_coreweave + stage: unit-tests + tags: &id001 + - arch/amd64 + - env/prod + - origin/jet-fleet + - owner/jet-core + - purpose/jet-client + - team/megatron + timeout: 7 days +stages: +- unit-tests +tests/unit_tests/data/: + artifacts: + paths: + - results/ + when: always + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570 + needs: + - job: functional:configure + pipeline: $PARENT_PIPELINE_ID + rules: + - if: $CI_PIPELINE_SOURCE == "parent_pipeline" + - if: $CI_MERGE_REQUEST_ID + script: + - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py + --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case + tests/unit_tests/data/ --container-tag 20283570 --cluster dgxh100_coreweave + stage: unit-tests + tags: *id001 + timeout: 7 days +tests/unit_tests/dist_checkpointing/: + artifacts: + paths: + - results/ + when: always + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570 + needs: + - job: functional:configure + pipeline: $PARENT_PIPELINE_ID + rules: + - if: $CI_PIPELINE_SOURCE == "parent_pipeline" + - if: $CI_MERGE_REQUEST_ID + script: + - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py + --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case + tests/unit_tests/dist_checkpointing/ --container-tag 20283570 --cluster dgxh100_coreweave + stage: unit-tests + tags: *id001 + timeout: 7 days +tests/unit_tests/distributed/: + artifacts: + paths: + - results/ + when: always + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570 + needs: + - job: functional:configure + pipeline: $PARENT_PIPELINE_ID + rules: + - if: $CI_PIPELINE_SOURCE == "parent_pipeline" + - if: $CI_MERGE_REQUEST_ID + script: + - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py + --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case + tests/unit_tests/distributed/ --container-tag 20283570 --cluster dgxh100_coreweave + stage: unit-tests + tags: *id001 + timeout: 7 days +? 
tests/unit_tests/test_inference.py tests/unit_tests/test_tokenizer.py tests/unit_tests/test_utilities.py + tests/unit_tests/test_training.py +: artifacts: + paths: + - results/ + when: always + image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_utility:20283570 + needs: + - job: functional:configure + pipeline: $PARENT_PIPELINE_ID + rules: + - if: $CI_PIPELINE_SOURCE == "parent_pipeline" + - if: $CI_MERGE_REQUEST_ID + script: + - export PYTHONPATH=$(pwd); python tests/test_utils/scripts/launch_jet_workload.py + --model unit-tests --environment lts --n-repeat 1 --time-limit 1800 --test-case + tests/unit_tests/test_inference.py tests/unit_tests/test_tokenizer.py tests/unit_tests/test_utilities.py + tests/unit_tests/test_training.py --container-tag 20283570 --cluster dgxh100_coreweave + stage: unit-tests + tags: *id001 + timeout: 7 days
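Note: the unit-test success heuristic added to `launch_jet_workload.py` above (a run passes only if the pytest summary line mentions "passed" and not "failed", or if nothing was selected) can be exercised standalone. A minimal sketch of the same regex logic against representative pytest summary lines:

```python
# Standalone reproduction of the pass/fail log check used for
# unit-test pipelines in launch_jet_workload.py.
import re


def unit_test_logs_ok(concat_logs: str) -> bool:
    """Mirror the success predicate applied to concatenated JET logs."""
    return bool(
        (
            re.search(r'=.*?\bpassed\b.*?=', concat_logs)
            and not re.search(r'=.*?\bfailed\b.*?=', concat_logs)
        )
        or "0 selected" in concat_logs
    )


# Representative pytest summary lines (illustrative, not real CI logs).
assert unit_test_logs_ok("==== 12 passed, 2 skipped in 34.5s ====")
assert not unit_test_logs_ok("==== 1 failed, 11 passed in 34.5s ====")
assert unit_test_logs_ok("collected 4 items / 4 deselected / 0 selected")
```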