Skip to content

Commit

Permalink
Merge pull request #37 from ROCm/ifu_update_dec11
Browse files Browse the repository at this point in the history
Ifu update dec11
  • Loading branch information
wenchenvincent authored Jan 24, 2025
2 parents e35b94b + af673a5 commit f217adc
Show file tree
Hide file tree
Showing 672 changed files with 176,812 additions and 14,643 deletions.
129 changes: 79 additions & 50 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,100 +10,129 @@ workflow:
- if: $CI_PIPELINE_SOURCE == "web"
- if: $CI_COMMIT_REF_PROTECTED == "true"
variables:
FUNCTIONAL_TEST: "no"
FUNCTIONAL_TEST: 'no'
- if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 5
UNIT_TEST_TIMEOUT: 50
FUNCTIONAL_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 15
FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 2700
FUNCTIONAL_TEST_CLUSTER_A100: ''
FUNCTIONAL_TEST_CLUSTER_H100: ''
PUBLISH: 'no'
- if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 5
UNIT_TEST_TIMEOUT: 50
FUNCTIONAL_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 15
FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 2700
FUNCTIONAL_TEST_CLUSTER_A100: ''
FUNCTIONAL_TEST_CLUSTER_H100: ''
PUBLISH: 'no'
- if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 5
UNIT_TEST_TIMEOUT: 50
FUNCTIONAL_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 15
FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 9000
FUNCTIONAL_TEST_CLUSTER_A100: ''
FUNCTIONAL_TEST_CLUSTER_H100: ''
PUBLISH: 'no'
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
FUNCTIONAL_TEST: "no"
FUNCTIONAL_TEST: 'no'
PUBLISH: 'no'
- when: never
auto_cancel:
on_new_commit: interruptible
# on_job_failure: all

stages:
- test
- test
- functional_tests
- convergence_tests
- publish

default:
interruptible: true

variables:
FUNCTIONAL_TEST:
value: "yes"
UNIT_TEST:
value: 'yes'
options:
- "yes"
- "no"
- 'yes'
- 'no'
description: To run the functional test suite
UNIT_TEST_REPEAT:
value: '1'
description: 'Number of repetitions'
UNIT_TEST_TIMEOUT:
value: '30'
description: Timeout (minutes) for Unit tests (all repeats)
FUNCTIONAL_TEST:
value: 'yes'
options:
- 'yes'
- 'no'
description: To run the functional test suite
FUNCTIONAL_TEST_SCOPE:
value: "mr"
value: 'mr'
options:
- "mr"
- "nightly"
- "weekly"
- "pre-release"
- "release"
description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
- 'mr'
- 'nightly'
- 'weekly'
- 'pre-release'
- 'release'
description: 'Testsuite to run (only for FUNCTIONAL_TEST=yes)'
FUNCTIONAL_TEST_REPEAT:
value: '5'
description: 'Number of repetitions per test'
FUNCTIONAL_TEST_TIME_LIMIT:
value: '2700'
description: 'Timeout in seconds per test'
FUNCTIONAL_TEST_CASES:
value: 'all'
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST_CLUSTER_A100:
value: "dgxa100_dracooci"
value: 'dgxa100_dracooci'
options:
- "dgxa100_dracooci"
- "dgxa100_dracooci-ord"
- 'dgxa100_dracooci'
- 'dgxa100_dracooci-ord'
description: 'Cluster for A100 workloads'
FUNCTIONAL_TEST_CLUSTER_H100:
value: "dgxh100_eos"
value: 'dgxh100_eos'
options:
- "dgxh100_coreweave"
- "dgxh100_eos"
- 'dgxh100_coreweave'
- 'dgxh100_eos'
description: 'Cluster for H100 workloads'
FUNCTIONAL_TEST_NAME:
description: "Name of functional test run (only for pre-release and release)"
PUBLISH:
value: "no"
options:
- "yes"
- "no"
description: 'Name of functional test run (only for pre-release and release)'
PUBLISH:
value: 'no'
options:
- 'yes'
- 'no'
description: Build and publish a wheel to PyPI
PUBLISH_SCOPE:
value: "code-freeze"
value: 'code-freeze'
options:
- "code-freeze"
- "release"
- 'code-freeze'
- 'release'
description: Type of publish (freeze or final release)

# CI wide variables
CI_MCORE_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci
CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting
UNIT_TEST_TIMEOUT: 15
UNIT_TEST_REPEAT: 1
UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility

include:
- .gitlab/stages/00.pre.yml
- .gitlab/stages/01.tests.yml
- .gitlab/stages/01.test.yml
- .gitlab/stages/02.functional-tests.yml
- .gitlab/stages/03.publish.yml
6 changes: 4 additions & 2 deletions .gitlab/labeler-config.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
CI:
- .gitlab-ci.yml
- Dockerfile.ci
- jet-tests.yml
- Dockerfile.ci.lts
- Dockerfile.ci.dev
- .github/**
- .gitlab/**

Datasets:
- megatron/core/datasets/**
Expand Down
72 changes: 31 additions & 41 deletions .gitlab/stages/00.pre.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
include:
- template: Security/Secret-Detection.gitlab-ci.yml

.pre_mr_rules:
.pre_rules:
rules:
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
allow_failure: true
Expand All @@ -10,45 +10,53 @@ include:
- when: never
stage: .pre

mirror_to_github:
.dind_rules:
image: docker:26.1.4-dind
variables:
DOCKER_HOST: unix:///var/run/docker.sock
before_script:
- docker system prune -a --filter "until=36h" -f || true
- echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
- echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin

pre:mirror_to_github:
rules:
- if: '$CI_COMMIT_REF_PROTECTED == "true" && $CI_PIPELINE_SOURCE == "push"'
- when: never
tags: [mcore-docker-node-small]
stage: .pre
image: python:3.10
variables:
GIT_STRATEGY: "clone"
GIT_STRATEGY: 'clone'
script:
- git checkout $CI_COMMIT_BRANCH
- git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true
- git push -u github $CI_COMMIT_BRANCH

create_ci_branches:
pre:create_ci_branches:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
- when: never
parallel:
matrix:
- branch: ci-unit-test-extended
- branch: ci-rebuild-mcore-nemo-image
- branch: ci-mr-a100
- branch: ci-nightly-a100
- branch: ci-weekly-a100
- branch: ci-weekly-h100
- branch: ci-mr
- branch: ci-nightly
- branch: ci-weekly
- branch: ci-pre-release
tags: [mcore-docker-node-small]
stage: .pre
image: python:3.10
variables:
GIT_STRATEGY: "clone"
GIT_STRATEGY: 'clone'
script:
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git"
- git switch --force-create $branch
- git push --force -u origin $branch

label_merge_request:
extends: [.pre_mr_rules]
pre:label_merge_request:
extends: [.pre_rules]
image: golang:1.22
tags:
- mcore-docker-node-small
Expand All @@ -67,37 +75,21 @@ label_merge_request:
source labels
curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
clean_docker_node:
extends: [.pre_mr_rules]
image: docker:26.1.4-dind
tags:
- ${node}
parallel:
matrix:
- node: 8xL40S
- node: mcore-docker-node-small
- node: mcore-docker-node-jet
script:
- export DOCKER_HOST='unix:///var/run/docker.sock'
- docker system prune -a --filter "until=36h" -f || true

maybe_cherry_pick_commit:
pre:maybe_cherry_pick_commit:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
- when: never
tags: [mcore-docker-node-small]
stage: .pre
image:
name: registry.gitlab.com/gitlab-ci-utils/curl-jq
entrypoint: [""]
image: badouralix/curl-jq
variables:
GIT_STRATEGY: "clone"
script:
GIT_STRATEGY: 'clone'
script:
- set -x
- set +e
- SHA=$(git rev-list --no-merges -n 1 HEAD)
- MESSAGE=$(git log -n 1 --pretty=format:%s $SHA)
- MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' )
- MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' )
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
- git config --global user.email "mcore-bot@nvidia.com"
- git config --global user.name "Mcore Bot"
Expand All @@ -115,10 +107,10 @@ maybe_cherry_pick_commit:
echo Nothing to cherry pick
exit 0
fi
echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do
TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false)
if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then
echo Release branch does not yet exist, will not cherry-pick
continue
Expand Down Expand Up @@ -155,7 +147,7 @@ maybe_cherry_pick_commit:
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed"
"text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed\ncc '$SLACK_ADMIN'"
}
}
]
Expand All @@ -168,11 +160,10 @@ maybe_cherry_pick_commit:
done
interruptible: false

check_milestone:
extends: [.pre_mr_rules]
image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache
tags:
- mcore-docker-node-small
pre:check_milestone:
extends: [.pre_rules]
image: badouralix/curl-jq
tags: [mcore-docker-node-small]
script:
- env
- |
Expand All @@ -182,4 +173,3 @@ check_milestone:
echo Please assign a Milestone to this MR!
exit 1
fi
Loading

0 comments on commit f217adc

Please sign in to comment.