diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c99b97f697..b24e9dd0b7 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -10,100 +10,129 @@ workflow:
- if: $CI_PIPELINE_SOURCE == "web"
- if: $CI_COMMIT_REF_PROTECTED == "true"
variables:
- FUNCTIONAL_TEST: "no"
+ FUNCTIONAL_TEST: 'no'
- if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
- UNIT_TEST_REPEAT: 5
- UNIT_TEST_TIMEOUT: 50
- FUNCTIONAL_TEST: "yes"
+ UNIT_TEST_REPEAT: 1
+ UNIT_TEST_TIMEOUT: 15
+ FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: mr
- FUNCTIONAL_TEST_CLUSTER_A100: ""
- FUNCTIONAL_TEST_CLUSTER_H100: ""
+ FUNCTIONAL_TEST_REPEAT: 5
+ FUNCTIONAL_TEST_TIME_LIMIT: 2700
+ FUNCTIONAL_TEST_CLUSTER_A100: ''
+ FUNCTIONAL_TEST_CLUSTER_H100: ''
+ PUBLISH: 'no'
- if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
- UNIT_TEST_REPEAT: 5
- UNIT_TEST_TIMEOUT: 50
- FUNCTIONAL_TEST: "yes"
+ UNIT_TEST_REPEAT: 1
+ UNIT_TEST_TIMEOUT: 15
+ FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: nightly
- FUNCTIONAL_TEST_CLUSTER_A100: ""
- FUNCTIONAL_TEST_CLUSTER_H100: ""
+ FUNCTIONAL_TEST_REPEAT: 5
+ FUNCTIONAL_TEST_TIME_LIMIT: 2700
+ FUNCTIONAL_TEST_CLUSTER_A100: ''
+ FUNCTIONAL_TEST_CLUSTER_H100: ''
+ PUBLISH: 'no'
- if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
- UNIT_TEST_REPEAT: 5
- UNIT_TEST_TIMEOUT: 50
- FUNCTIONAL_TEST: "yes"
+ UNIT_TEST_REPEAT: 1
+ UNIT_TEST_TIMEOUT: 15
+ FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: weekly
- FUNCTIONAL_TEST_CLUSTER_A100: ""
- FUNCTIONAL_TEST_CLUSTER_H100: ""
+ FUNCTIONAL_TEST_REPEAT: 1
+ FUNCTIONAL_TEST_TIME_LIMIT: 9000
+ FUNCTIONAL_TEST_CLUSTER_A100: ''
+ FUNCTIONAL_TEST_CLUSTER_H100: ''
+ PUBLISH: 'no'
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
- FUNCTIONAL_TEST: "no"
+ FUNCTIONAL_TEST: 'no'
+ PUBLISH: 'no'
- when: never
auto_cancel:
on_new_commit: interruptible
+ # on_job_failure: all
stages:
- - test
+ - test
- functional_tests
- - convergence_tests
- publish
default:
interruptible: true
variables:
- FUNCTIONAL_TEST:
- value: "yes"
+ UNIT_TEST:
+ value: 'yes'
options:
- - "yes"
- - "no"
+ - 'yes'
+ - 'no'
+ description: To run the unit test suite
+ UNIT_TEST_REPEAT:
+ value: '1'
+ description: 'Number of repetitions'
+ UNIT_TEST_TIMEOUT:
+ value: '30'
+ description: Timeout (minutes) for Unit tests (all repeats)
+ FUNCTIONAL_TEST:
+ value: 'yes'
+ options:
+ - 'yes'
+ - 'no'
description: To run the functional test suite
FUNCTIONAL_TEST_SCOPE:
- value: "mr"
+ value: 'mr'
options:
- - "mr"
- - "nightly"
- - "weekly"
- - "pre-release"
- - "release"
- description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
+ - 'mr'
+ - 'nightly'
+ - 'weekly'
+ - 'pre-release'
+ - 'release'
+ description: 'Testsuite to run (only for FUNCTIONAL_TEST=yes)'
+ FUNCTIONAL_TEST_REPEAT:
+ value: '5'
+ description: 'Number of repetitions per test'
+ FUNCTIONAL_TEST_TIME_LIMIT:
+ value: '2700'
+ description: 'Timeout in seconds per test'
+ FUNCTIONAL_TEST_CASES:
+ value: 'all'
+ description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST_CLUSTER_A100:
- value: "dgxa100_dracooci"
+ value: 'dgxa100_dracooci'
options:
- - "dgxa100_dracooci"
- - "dgxa100_dracooci-ord"
+ - 'dgxa100_dracooci'
+ - 'dgxa100_dracooci-ord'
description: 'Cluster for A100 workloads'
FUNCTIONAL_TEST_CLUSTER_H100:
- value: "dgxh100_eos"
+ value: 'dgxh100_eos'
options:
- - "dgxh100_coreweave"
- - "dgxh100_eos"
+ - 'dgxh100_coreweave'
+ - 'dgxh100_eos'
description: 'Cluster for H100 workloads'
FUNCTIONAL_TEST_NAME:
- description: "Name of functional test run (only for pre-release and release)"
- PUBLISH:
- value: "no"
- options:
- - "yes"
- - "no"
+ description: 'Name of functional test run (only for pre-release and release)'
+ PUBLISH:
+ value: 'no'
+ options:
+ - 'yes'
+ - 'no'
description: Build and publish a wheel to PyPI
PUBLISH_SCOPE:
- value: "code-freeze"
+ value: 'code-freeze'
options:
- - "code-freeze"
- - "release"
+ - 'code-freeze'
+ - 'release'
description: Type of publish (freeze or final release)
# CI wide variables
- CI_MCORE_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci
+ CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
- LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting
- UNIT_TEST_TIMEOUT: 15
- UNIT_TEST_REPEAT: 1
+ UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility
include:
- .gitlab/stages/00.pre.yml
- - .gitlab/stages/01.tests.yml
+ - .gitlab/stages/01.test.yml
- .gitlab/stages/02.functional-tests.yml
- .gitlab/stages/03.publish.yml
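The workflow rules above are evaluated top-down and the first match wins, so a merge request carrying several labels resolves to the scope of the earliest matching rule. As a rough illustration, here is a hypothetical bash sketch (not part of the pipeline; variable names mirror the ones above) of that selection and of the minutes-to-seconds conversion applied to `UNIT_TEST_TIMEOUT` later in `01.test.yml`:

```bash
#!/usr/bin/env bash
# Hypothetical sketch of the label-driven workflow rules above.
# First match wins, mirroring GitLab's top-down rule evaluation.
CI_MERGE_REQUEST_LABELS="${CI_MERGE_REQUEST_LABELS:-Run tests}"   # assumed input

case "$CI_MERGE_REQUEST_LABELS" in
  *"Run tests"*)   FUNCTIONAL_TEST_SCOPE=mr;      FUNCTIONAL_TEST_REPEAT=5; FUNCTIONAL_TEST_TIME_LIMIT=2700 ;;
  *"Run nightly"*) FUNCTIONAL_TEST_SCOPE=nightly; FUNCTIONAL_TEST_REPEAT=5; FUNCTIONAL_TEST_TIME_LIMIT=2700 ;;
  *"Run weekly"*)  FUNCTIONAL_TEST_SCOPE=weekly;  FUNCTIONAL_TEST_REPEAT=1; FUNCTIONAL_TEST_TIME_LIMIT=9000 ;;
  *)               FUNCTIONAL_TEST_SCOPE=none ;;
esac

# UNIT_TEST_TIMEOUT is kept in minutes and converted to seconds downstream,
# as in the "$(( UNIT_TEST_TIMEOUT * 60 ))" expansion in 01.test.yml.
UNIT_TEST_TIMEOUT=15
echo "scope=$FUNCTIONAL_TEST_SCOPE repeat=${FUNCTIONAL_TEST_REPEAT:-0}" \
     "limit=${FUNCTIONAL_TEST_TIME_LIMIT:-0}s unit_timeout=$(( UNIT_TEST_TIMEOUT * 60 ))s"
```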
diff --git a/.gitlab/labeler-config.yml b/.gitlab/labeler-config.yml
index 2577c2b929..3dc4001cd7 100644
--- a/.gitlab/labeler-config.yml
+++ b/.gitlab/labeler-config.yml
@@ -1,7 +1,9 @@
CI:
- .gitlab-ci.yml
-- Dockerfile.ci
-- jet-tests.yml
+- Dockerfile.ci.lts
+- Dockerfile.ci.dev
+- .github/**
+- .gitlab/**
Datasets:
- megatron/core/datasets/**
diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml
index a91436be87..65564cf884 100644
--- a/.gitlab/stages/00.pre.yml
+++ b/.gitlab/stages/00.pre.yml
@@ -1,7 +1,7 @@
include:
- template: Security/Secret-Detection.gitlab-ci.yml
-.pre_mr_rules:
+.pre_rules:
rules:
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
allow_failure: true
@@ -10,7 +10,16 @@ include:
- when: never
stage: .pre
-mirror_to_github:
+.dind_rules:
+ image: docker:26.1.4-dind
+ variables:
+ DOCKER_HOST: unix:///var/run/docker.sock
+ before_script:
+ - docker system prune -a --filter "until=36h" -f || true
+ - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
+ - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
+
+pre:mirror_to_github:
rules:
- if: '$CI_COMMIT_REF_PROTECTED == "true" && $CI_PIPELINE_SOURCE == "push"'
- when: never
@@ -18,13 +27,13 @@ mirror_to_github:
stage: .pre
image: python:3.10
variables:
- GIT_STRATEGY: "clone"
+ GIT_STRATEGY: 'clone'
script:
- git checkout $CI_COMMIT_BRANCH
- git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true
- git push -u github $CI_COMMIT_BRANCH
-create_ci_branches:
+pre:create_ci_branches:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
- when: never
@@ -32,23 +41,22 @@ create_ci_branches:
matrix:
- branch: ci-unit-test-extended
- branch: ci-rebuild-mcore-nemo-image
- - branch: ci-mr-a100
- - branch: ci-nightly-a100
- - branch: ci-weekly-a100
- - branch: ci-weekly-h100
+ - branch: ci-mr
+ - branch: ci-nightly
+ - branch: ci-weekly
- branch: ci-pre-release
tags: [mcore-docker-node-small]
stage: .pre
image: python:3.10
variables:
- GIT_STRATEGY: "clone"
+ GIT_STRATEGY: 'clone'
script:
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git"
- git switch --force-create $branch
- git push --force -u origin $branch
-label_merge_request:
- extends: [.pre_mr_rules]
+pre:label_merge_request:
+ extends: [.pre_rules]
image: golang:1.22
tags:
- mcore-docker-node-small
@@ -67,37 +75,21 @@ label_merge_request:
source labels
curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
-clean_docker_node:
- extends: [.pre_mr_rules]
- image: docker:26.1.4-dind
- tags:
- - ${node}
- parallel:
- matrix:
- - node: 8xL40S
- - node: mcore-docker-node-small
- - node: mcore-docker-node-jet
- script:
- - export DOCKER_HOST='unix:///var/run/docker.sock'
- - docker system prune -a --filter "until=36h" -f || true
-
-maybe_cherry_pick_commit:
+pre:maybe_cherry_pick_commit:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
- when: never
tags: [mcore-docker-node-small]
stage: .pre
- image:
- name: registry.gitlab.com/gitlab-ci-utils/curl-jq
- entrypoint: [""]
+ image: badouralix/curl-jq
variables:
- GIT_STRATEGY: "clone"
- script:
+ GIT_STRATEGY: 'clone'
+ script:
- set -x
- set +e
- SHA=$(git rev-list --no-merges -n 1 HEAD)
- MESSAGE=$(git log -n 1 --pretty=format:%s $SHA)
- - MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' )
+ - MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' )
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
- git config --global user.email "mcore-bot@nvidia.com"
- git config --global user.name "Mcore Bot"
@@ -115,10 +107,10 @@ maybe_cherry_pick_commit:
echo Nothing to cherry pick
exit 0
fi
-
+
echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do
TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false)
-
+
if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then
echo Release branch does not yet exist, will not cherry-pick
continue
@@ -155,7 +147,7 @@ maybe_cherry_pick_commit:
"type": "section",
"text": {
"type": "mrkdwn",
- "text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed"
+ "text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed\ncc '$SLACK_ADMIN'"
}
}
]
@@ -168,11 +160,10 @@ maybe_cherry_pick_commit:
done
interruptible: false
-check_milestone:
- extends: [.pre_mr_rules]
- image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache
- tags:
- - mcore-docker-node-small
+pre:check_milestone:
+ extends: [.pre_rules]
+ image: badouralix/curl-jq
+ tags: [mcore-docker-node-small]
script:
- env
- |
@@ -182,4 +173,3 @@ check_milestone:
echo Please assign a Milestone to this MR!
exit 1
fi
-
\ No newline at end of file
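The new `.dind_rules` anchor centralizes the Docker-in-Docker housekeeping that the removed `clean_docker_node` job used to perform per node. A rough sketch of that `before_script`, runnable on any Docker host; `NGC_API_KEY` and the `CI_REGISTRY_*` values are assumed to come from CI/CD variables:

```bash
#!/usr/bin/env bash
# Sketch of the .dind_rules before_script: prune stale layers, then log in.
set -euo pipefail

# Reclaim disk on shared runners: drop images/containers unused for 36 hours.
docker system prune -a --filter "until=36h" -f || true

# Non-interactive logins; --password-stdin keeps secrets out of `ps` output.
# NGC expects the literal username '$oauthtoken' (hence the single quotes).
echo "$NGC_API_KEY"          | docker login nvcr.io        -u '$oauthtoken'       --password-stdin
echo "$CI_REGISTRY_PASSWORD" | docker login "$CI_REGISTRY" -u "$CI_REGISTRY_USER" --password-stdin
```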
diff --git a/.gitlab/stages/01.test.yml b/.gitlab/stages/01.test.yml
new file mode 100644
index 0000000000..f387e26f72
--- /dev/null
+++ b/.gitlab/stages/01.test.yml
@@ -0,0 +1,485 @@
+.test_rules:
+ rules:
+ - if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
+ allow_failure: true
+ when: on_success
+ - when: on_success
+ stage: test
+
+include:
+ - template: Security/Secret-Detection.gitlab-ci.yml
+
+test:build_image:
+ extends: [.test_rules, .dind_rules]
+ tags:
+ - arch/amd64
+ - origin/jet-fleet
+ - env/prod
+ - ${TAG}
+ services:
+ - name: docker:24.0.5-dind
+ variables:
+ HEALTHCHECK_TCP_PORT: '2376'
+ timeout: 45m
+ parallel:
+ matrix:
+ - IMAGE: CI_MCORE_LTS_IMAGE
+ FILE: Dockerfile.ci.lts
+ BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
+ - IMAGE: CI_MCORE_DEV_IMAGE
+ FILE: Dockerfile.ci.dev
+ BASE_IMAGE: nvcr.io/nvidia/pytorch:24.07-py3
+ - IMAGE: CI_NEMO_IMAGE
+ FILE: Dockerfile.ci.lts
+ BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
+ - IMAGE: UTILITY_IMAGE
+ FILE: Dockerfile.linting
+ BASE_IMAGE: python:3.10
+ variables:
+ DOCKER_HOST: tcp://docker:2376
+ DOCKER_TLS_CERTDIR: '/certs'
+ DOCKER_TLS_VERIFY: 1
+ DOCKER_CERT_PATH: '$DOCKER_TLS_CERTDIR/client'
+ TAG: purpose/builder-large
+ STAGE: jet
+ script:
+ - apk add bash
+ - |
+ bash -c '
+ set -x
+ env
+ eval "IMAGE=\$$IMAGE"
+
+ docker context create tls-environment
+ docker buildx create --name container --driver=docker-container --use tls-environment
+
+ ADDITIONAL_PARAMS=()
+
+ if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" ]]; then
+ ADDITIONAL_PARAMS+=("--pull")
+ ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main")
+ fi
+
+ if [[ "$CI_COMMIT_BRANCH" == "ci-nightly-a100" ]]; then
+ ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly")
+ fi
+
+ echo $(git rev-parse HEAD)
+
+ DOCKER_BUILDKIT=1 docker build \
+ --secret id=JET_INDEX_URLS \
+ --target $STAGE \
+ -f $FILE \
+ -t ${IMAGE}:${CI_PIPELINE_ID} \
+ --builder=container \
+ --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \
+ --build-arg MCORE_REPO=${CI_REPOSITORY_URL} \
+ --build-arg MCORE_REF=$CI_COMMIT_SHA \
+ --build-arg MCORE_BACKWARDS_REF="core_r0.9.0" \
+ --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \
+ --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \
+ --cache-from type=registry,ref=${IMAGE}-buildcache:main \
+ --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \
+ --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \
+ --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
+ --push \
+ ${ADDITIONAL_PARAMS[@]} .
+ '
+ retry:
+ max: 2
+
+test:unit_tests_configure:
+ extends: [.test_rules]
+ needs:
+ - test:build_image
+ image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
+ tags: [mcore-docker-node-small]
+ before_script:
+ - git rm -r tests/test_utils/local_recipes || true
+ - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
+ - ls tests/test_utils/local_recipes
+ script:
+ - set -x
+ - |
+ A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
+ H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
+ - |
+ export PYTHONPATH=$(pwd)
+ python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
+ --scope "unit-tests" \
+ --environment lts \
+ --n-repeat "${UNIT_TEST_REPEAT}" \
+ --time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
+ --test-cases "all" \
+ --a100-cluster "dgxa100_dracooci-ord" \
+ --h100-cluster "dgxh100_coreweave" \
+ --container-image ${UTILITY_IMAGE} \
+ --container-tag ${CI_PIPELINE_ID} \
+ --dependent-job "test:unit_tests_configure" \
+ --tag "legacy" \
+ --output-path "unit-test-job-lts-legacy.yaml"
+ - |
+ export PYTHONPATH=$(pwd)
+ python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
+ --scope "unit-tests" \
+ --environment lts \
+ --n-repeat "${UNIT_TEST_REPEAT}" \
+ --time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
+ --test-cases "all" \
+ --a100-cluster "dgxa100_dracooci-ord" \
+ --h100-cluster "dgxh100_coreweave" \
+ --container-image ${UTILITY_IMAGE} \
+ --container-tag ${CI_PIPELINE_ID} \
+ --dependent-job "test:unit_tests_configure" \
+ --tag "latest" \
+ --output-path "unit-test-job-lts-latest.yaml"
+ - |
+ export PYTHONPATH=$(pwd)
+ python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
+ --scope "unit-tests" \
+ --environment dev \
+ --n-repeat "${UNIT_TEST_REPEAT}" \
+ --time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
+ --test-cases "all" \
+ --a100-cluster "dgxa100_dracooci-ord" \
+ --h100-cluster "dgxh100_coreweave" \
+ --container-image ${UTILITY_IMAGE} \
+ --container-tag ${CI_PIPELINE_ID} \
+ --dependent-job "test:unit_tests_configure" \
+ --tag "legacy" \
+ --output-path "unit-test-job-dev-legacy.yaml"
+ - |
+ export PYTHONPATH=$(pwd)
+ python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
+ --scope "unit-tests" \
+ --environment dev \
+ --n-repeat "${UNIT_TEST_REPEAT}" \
+ --time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
+ --test-cases "all" \
+ --a100-cluster "dgxa100_dracooci-ord" \
+ --h100-cluster "dgxh100_coreweave" \
+ --container-image ${UTILITY_IMAGE} \
+ --container-tag ${CI_PIPELINE_ID} \
+ --dependent-job "test:unit_tests_configure" \
+ --tag "latest" \
+ --output-path "unit-test-job-dev-latest.yaml"
+
+ artifacts:
+ paths:
+ - unit-test-job-dev-legacy.yaml
+ - unit-test-job-dev-latest.yaml
+ - unit-test-job-lts-legacy.yaml
+ - unit-test-job-lts-latest.yaml
+ - tests/test_utils/local_recipes
+
+.unit_tests_run:
+ needs:
+ - test:formatting
+ - test:copyright
+ - job: test:secret_detection
+ optional: true
+ - test:unit_tests_configure
+ extends: [.test_rules]
+ trigger:
+ include:
+ - artifact: unit-test-job-$ENVIRONMENT-$TAG.yaml
+ job: test:unit_tests_configure
+ strategy: depend
+ variables:
+ RO_API_TOKEN: $PAT
+ CONTAINER_TAG: $CI_PIPELINE_ID
+ CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
+ GITLAB_ENDPOINT: $GITLAB_ENDPOINT
+ PARENT_PIPELINE_ID: $CI_PIPELINE_ID
+ inherit:
+ variables: true
+ rules:
+ - if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
+ allow_failure: true
+ when: on_success
+ - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
+ when: on_success
+
+test:unit_tests_pyt(DEV)_mcore(legacy):
+ extends: [.unit_tests_run]
+ variables:
+ ENVIRONMENT: dev
+ TAG: legacy
+
+test:unit_tests_pyt(LTS)_mcore(legacy):
+ extends: [.unit_tests_run]
+ variables:
+ ENVIRONMENT: lts
+ TAG: legacy
+
+test:unit_tests_pyt(DEV)_mcore(latest):
+ extends: [.unit_tests_run]
+ variables:
+ ENVIRONMENT: dev
+ TAG: latest
+
+test:unit_tests_pyt(LTS)_mcore(latest):
+ extends: [.unit_tests_run]
+ variables:
+ ENVIRONMENT: lts
+ TAG: latest
+
+test:notify_unit_tests:
+ extends: [.test_rules]
+ image: badouralix/curl-jq
+ needs:
+ - test:unit_tests_pyt(DEV)_mcore(latest)
+ - test:unit_tests_pyt(LTS)_mcore(latest)
+ tags:
+ - mcore-docker-node-small
+ script:
+ - apk add bash
+ - apk add --update coreutils
+ - env
+ - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
+ - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
+ - export GITLAB_ENDPOINT
+ - export CONTEXT="unit-tests-extended"
+ - export DATE=$(date +"%Y-%m-%d")
+ - bash tests/test_utils/shell_scripts/notify.sh ${CI_PIPELINE_ID} "test:unit_tests_pyt"
+ artifacts:
+ when: always
+ paths:
+ - scripts
+ rules:
+ - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "ci-unit-test-extended"
+ when: always
+ - when: never
+
+test:docs_build:
+ extends: [.test_rules]
+ image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
+ tags: [mcore-docker-node-small]
+ needs: [test:build_image]
+ script:
+ - cd ..
+ - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
+ - mv megatron-lm/ documentation/
+ - cd documentation/
+ - ./repo docs
+
+test:formatting:
+ extends: [.test_rules]
+ image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
+ tags: [mcore-docker-node-small]
+ needs: [test:build_image]
+ variables:
+ GIT_STRATEGY: 'clone'
+ script:
+ - |
+ if [[ "$CI_PIPELINE_SOURCE" != "merge_request_event" ]]; then
+ exit 0
+ fi
+ - set +e
+ - git fetch origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
+ - git fetch origin main:main
+ - git checkout $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
+ - |
+ if [[ "$CI_MERGE_REQUEST_PROJECT_PATH" == "$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH" ]]; then
+ bash tools/autoformat.sh
+ set -e
+ git config --global user.email "mcore-bot@nvidia.com"
+ git config --global user.name "Mcore Bot"
+ git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
+ git add -A .
+ git commit -m "chore: Format files" || true
+ git push -u origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
+ fi
+ - env
+ - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh
+
+test:copyright:
+ extends: [.test_rules]
+ tags: [mcore-docker-node-small]
+ image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
+ needs: [test:build_image]
+ script:
+ - git fetch origin main
+ - bash tools/copyright.sh
+
+# Override from template
+secret_detection:
+ rules:
+ - when: never
+
+# Inherit and modify template
+test:secret_detection:
+ tags: [mcore-docker-node-small]
+ extends: ['.secret-analyzer']
+ variables:
+ GIT_DEPTH: 0
+ SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}
+ allow_failure: true
+ rules:
+ - if: $CI_PIPELINE_SOURCE == "merge_request_event"
+ - when: never
+ script:
+ - apk add jq
+ - /analyzer run
+ - |
+ if [[ $(cat gl-secret-detection-report.json | jq '.vulnerabilities | length > 0') == true ]]; then
+ echo "Atleast one vulnerability has been found"
+ cat gl-secret-detection-report.json | jq '.'
+ exit 1
+ fi
+
+test:pypi_build_wheel:
+ extends: [.test_rules]
+ image:
+ name: quay.io/pypa/manylinux_2_28_x86_64
+ entrypoint: ['']
+ tags: [mcore-docker-node-small]
+ variables:
+ PUBLISH_DRYRUN: 'yes'
+ PY_ENV: pytorch:24.07
+ script:
+ - echo $PUBLISH_DRYRUN
+ - >
+ if [ "$PUBLISH_DRYRUN" = "yes" ]; then
+ PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = '\(.*\)'/\1/p" megatron/core/package_info.py)
+ sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" megatron/core/package_info.py
+ fi
+ - /opt/python/cp310-cp310/bin/python -m build
+ - /opt/python/cp311-cp311/bin/python -m build
+ - auditwheel repair dist/*.whl
+ artifacts:
+ paths:
+ - megatron/core/package_info.py
+ - wheelhouse/
+
+test:pypi_test_wheel:
+ extends: [.test_rules]
+ image: nvcr.io/nvidia/pytorch:24.01-py3
+ needs: [test:pypi_build_wheel]
+ tags: [mcore-docker-node-small]
+ variables:
+ PUBLISH_DRYRUN: 'yes'
+ script:
+ - EXPECTED_RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
+ - rm -rf megatron
+ - pip install wheelhouse/*cp310*.whl
+
+ - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
+ - >
+ echo "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER"
+ - test "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER"
+ - echo "RELEASE_NUMBER=$EXPECTED_RELEASE_NUMBER" | tee -a build.env
+ artifacts:
+ reports:
+ dotenv: build.env
+ paths:
+ - wheelhouse/
+
+test:pypi_push_wheel:
+ extends: [.test_rules]
+ image: python:3.10
+ tags: [mcore-docker-node-small]
+ needs: [test:pypi_test_wheel]
+ variables:
+ PUBLISH_DRYRUN: 'yes'
+ timeout: 3m
+ script:
+ - >
+ if [ "$PUBLISH_DRYRUN" = "yes" ]; then
+ REPOSITORY=testpypi
+ export TWINE_USERNAME=$TWINE_TEST_USERNAME
+ export TWINE_PASSWORT=$TWINE_TEST_PASSWORD
+ else
+ REPOSITORY=pypi
+ export TWINE_USERNAME=$TWINE_PROD_USERNAME
+ export TWINE_PASSWORT=$TWINE_PROD_PASSWORD
+ fi
+ - pip install twine
+ - >
+ for i in 1 2 3 4 5; do
+ twine upload --verbose -u $TWINE_USERNAME -p $TWINE_PASSWORT --repository $REPOSITORY wheelhouse/* && break || sleep $(( 60*2**i ));
+ done
+ rules:
+ - if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
+ allow_failure: true
+ when: on_success
+ - when: on_success
+ allow_failure: true
+
+test:gh_release:
+ extends: [.test_rules]
+ needs: [test:pypi_test_wheel]
+ tags: [mcore-docker-node-small]
+ image: badouralix/curl-jq
+ variables:
+ PUBLISH_DRYRUN: 'yes'
+ script:
+ - NAME="NVIDIA Megatron Core $RELEASE_NUMBER"
+ - CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
+ - CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d')
+ - >
+ PAYLOAD=$(jq -nc \
+ --arg CI_COMMIT_BRANCH "$CI_COMMIT_BRANCH" \
+ --arg NAME "$NAME" \
+ --arg BODY "$CHANGELOG" \
+ '{
+ "tag_name": $CI_COMMIT_BRANCH,
+ "target_commitish": $CI_COMMIT_BRANCH,
+ "name": $NAME,
+ "body": $BODY,
+ "draft": false,
+ "prerelease": false,
+ "generate_release_notes": false
+ }'
+ )
+ - >
+ CMD=$(echo curl -L \
+ -X POST \
+ -H "Accept: application/vnd.github+json" \
+ -H "Authorization: Bearer $GH_TOKEN" \
+ -H "X-GitHub-Api-Version: 2022-11-28" \
+ https://api.github.com/repos/NVIDIA/Megatron-LM/releases \
+ -d "$PAYLOAD"
+ )
+
+ if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
+ echo "$CMD"
+ else
+ eval "$CMD"
+ fi
+
+test:notify_release:
+ needs: [test:pypi_test_wheel, test:pypi_push_wheel, test:gh_release]
+ extends: [.test_rules]
+ image: badouralix/curl-jq
+ tags: [mcore-docker-node-small]
+ variables:
+ PUBLISH_DRYRUN: 'yes'
+ script:
+ - URL="https://github.com/NVIDIA/Megatron-LM/releases/tag/core_r$RELEASE_NUMBER"
+ - >
+ MESSAGE='{
+ "blocks": [
+ {
+ "type": "section",
+ "text": {
+ "type": "mrkdwn",
+ "text": "Releasebot 🤖: Megatron-Core released <'$URL'|core_r'"$RELEASE_NUMBER"'> 🚀"
+ }
+ }
+ ]
+ }'
+ - echo "$MESSAGE"
+ - >
+ CMD=$(echo curl \
+ -X POST \
+ -H "Content-type: application/json" \
+ --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK_MAIN}
+ )
+
+ if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
+ echo "$CMD"
+ else
+ eval "$CMD"
+ fi
+
\ No newline at end of file
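The `test:pypi_push_wheel` job retries the upload with exponential backoff instead of failing on the first transient PyPI error. A standalone sketch of that loop, assuming `REPOSITORY`, `TWINE_USERNAME`, and `TWINE_PASSWORT` are exported as in the job:

```bash
#!/usr/bin/env bash
# Retry `twine upload` up to five times with exponential backoff
# (2, 4, 8, 16, 32 minutes between attempts).
set -u

for i in 1 2 3 4 5; do
  if twine upload --verbose -u "$TWINE_USERNAME" -p "$TWINE_PASSWORT" \
       --repository "$REPOSITORY" wheelhouse/*; then
    break                    # upload succeeded, stop retrying
  fi
  sleep $(( 60 * 2**i ))     # 120s, 240s, 480s, ...
done
```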
diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml
deleted file mode 100644
index ed80e96fee..0000000000
--- a/.gitlab/stages/01.tests.yml
+++ /dev/null
@@ -1,208 +0,0 @@
-.test_mr_rules:
- rules:
- - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
- allow_failure: true
- when: always
- - when: always
- stage: test
-
-include:
- - template: Security/Secret-Detection.gitlab-ci.yml
-
-build_image:
- extends: [.test_mr_rules]
- tags:
- - ${TAG}
- image: docker:26.1.4-dind
- timeout: 45m
- parallel:
- matrix:
- - IMAGE: CI_MCORE_IMAGE
- FILE: Dockerfile.ci
- BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
- TAG: mcore-docker-node-large
- - IMAGE: CI_MCORE_DEV_IMAGE
- FILE: Dockerfile.ci.dev
- BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
- TAG: mcore-docker-node-large
- - IMAGE: CI_NEMO_IMAGE
- FILE: Dockerfile.ci
- BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
- TAG: mcore-docker-node-large
- - IMAGE: LINTING_IMAGE
- FILE: Dockerfile.linting
- BASE_IMAGE: python:3.10
- TAG: mcore-docker-node-small
- before_script:
- - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
- - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
- variables:
- STAGE: main
- script:
- - apk add bash
- - |
- bash -c '
- set -x
- env
- eval "IMAGE=\$$IMAGE"
-
- docker system prune -a --filter "until=24h" -f || true
-
- docker buildx create --name container --driver=docker-container
-
- ADDITIONAL_PARAMS=()
-
- if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then
- ADDITIONAL_PARAMS+=("--pull")
- ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main")
- fi
-
- if [[ "$CI_COMMIT_BRANCH" == "ci-nightly-a100" ]]; then
- ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly")
- fi
-
- DOCKER_BUILDKIT=1 docker build \
- --secret id=JET_INDEX_URLS \
- --target $STAGE \
- -f $FILE \
- -t ${IMAGE}:${CI_PIPELINE_ID} \
- --builder=container \
- --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \
- --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \
- --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \
- --cache-from type=registry,ref=${IMAGE}-buildcache:main \
- --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \
- --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \
- --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
- --push \
- ${ADDITIONAL_PARAMS[@]} .
- '
- retry:
- max: 2
-
-unit_tests:
- # This job runs both test suite of ToT and of a historic ref against
- # the current code. This is a form of backwards compatibility testing
- # and helps in providing stable interfaces.
- extends: [.test_mr_rules]
- image: ${IMAGE}:${CI_PIPELINE_ID}
- needs: [build_image]
- timeout: 180m
- parallel:
- matrix:
- - TAG: latest
- IMAGE: ${CI_MCORE_IMAGE}
- # - TAG: latest
- # IMAGE: ${CI_MCORE_DEV_IMAGE}
- - TAG: core_r0.9.0
- IMAGE: ${CI_MCORE_IMAGE}
- tags: [8xL40S]
- variables:
- GIT_STRATEGY: clone
- GIT_DEPTH: 0
- before_script:
- - |
- if [[ $TAG != latest ]]; then
- git checkout $TAG
- rm -rf /opt/megatron-lm/tests
- cp -r tests/ /opt/megatron-lm
- fi
- script:
- - |
- cd /opt/megatron-lm
- if [[ $UNIT_TEST_REPEAT -eq 0 ]]; then
- exit 0
- fi
-
- for i in $(seq $UNIT_TEST_REPEAT); do
- SEED=$((RANDOM % 9000 + 1000));
- ARGS=()
- if [[ $TAG != latest ]]; then
- ARGS+=(-m "not internal")
- else
- ARGS+=(-m "not flaky")
- fi
- if [[ $IMAGE == ${CI_MCORE_DEV_IMAGE} ]]; then
- ARGS+=(-m "experimental")
- fi
- timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests
- done
- artifacts:
- paths:
- - coverage
- rules:
- - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
- allow_failure: true
- when: always
- - when: always
-
-unit-tests-results-notify:
- extends: [.test_mr_rules]
- image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
- needs: [unit_tests]
- tags:
- - mcore-docker-node-small
- script:
- - env
- - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
- - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
- - export GITLAB_ENDPOINT
- - export DATE=$(date +"%Y-%m-%d")
- - bash tests/functional_tests/shell_test_utils/notify_unit_tests.sh ${CI_PIPELINE_ID}
- artifacts:
- when: always
- paths:
- - scripts
- rules:
- - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "ci-unit-test-extended"
- when: always
- - when: never
-
-docs_build_test:
- extends: [.test_mr_rules]
- image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
- tags: [mcore-docker-node-small]
- needs: [build_image]
- script:
- - cd ..
- - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
- - mv megatron-lm/ documentation/
- - cd documentation/
- - ./repo docs
-
-formatting:
- extends: [.test_mr_rules]
- image: ${LINTING_IMAGE}:${CI_PIPELINE_ID}
- tags: [mcore-docker-node-small]
- needs: [build_image]
- script:
- - env
- - git fetch origin main
- - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh
-
-copyright:
- extends: [.test_mr_rules]
- tags: [mcore-docker-node-small]
- image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
- needs: [build_image]
- script:
- - git fetch origin main
- - bash tools/copyright.sh
-
-secret_detection:
- tags: [mcore-docker-node-small]
- variables:
- GIT_DEPTH: 0
- SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}
- allow_failure: true
- rules:
- - if: $CI_PIPELINE_SOURCE == "merge_request_event"
- script:
- - apk add jq
- - /analyzer run
- - |
- if [[ $(cat gl-secret-detection-report.json | jq '.vulnerabilities | length > 0') == true ]]; then
- echo "Atleast one vulnerability has been found"
- cat gl-secret-detection-report.json | jq '.'
- exit 1
- fi
\ No newline at end of file
diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml
index 531527b8b4..a128345c28 100644
--- a/.gitlab/stages/02.functional-tests.yml
+++ b/.gitlab/stages/02.functional-tests.yml
@@ -1,4 +1,4 @@
-.jet_common:
+.functional_tests_rules:
stage: functional_tests
rules:
- if: $FUNCTIONAL_TEST == "yes" && ($CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true")
@@ -16,27 +16,28 @@ include:
ref: main
file: downstreams.yml
-jet-build:
- extends: [build_image, .jet_common]
- variables:
- STAGE: jet
-
-jet-generate:
- needs: [jet-build]
- extends: [.jet_common]
- image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
+functional:configure:
+ needs:
+ - test:build_image
+ - job: test:unit_tests_pyt(DEV)_mcore(latest)
+ optional: true
+ - job: test:unit_tests_pyt(LTS)_mcore(latest)
+ optional: true
+ extends: [.functional_tests_rules]
+ image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
tags: [mcore-docker-node-small]
before_script:
- - git rm -r tests/functional_tests/local_recipes || true
- - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes
- - ls tests/functional_tests/local_recipes
- script:
+ - git rm -r tests/test_utils/local_recipes || true
+ - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
+ - ls tests/test_utils/local_recipes
+ script:
- set -x
- |
A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
- |
if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "pre-release" ]]; then
+ FUNCTIONAL_TEST_NAME=$(eval echo $FUNCTIONAL_TEST_NAME)
RELEASE_ARGS=(
"--run-name"
$FUNCTIONAL_TEST_NAME
@@ -46,57 +47,92 @@ jet-generate:
else
RELEASE_ARGS=()
fi
-
- |
export PYTHONPATH=$(pwd)
- python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
+ python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
--scope $FUNCTIONAL_TEST_SCOPE \
+ --environment dev \
+ --n-repeat "$FUNCTIONAL_TEST_REPEAT" \
+ --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
+ --test-cases $FUNCTIONAL_TEST_CASES \
--a100-cluster $A100_CLUSTER \
--h100-cluster $H100_CLUSTER \
+ --container-image ${UTILITY_IMAGE} \
--container-tag ${CI_PIPELINE_ID} \
- --container-image ${CI_MCORE_IMAGE} \
- --container-image-dev ${CI_MCORE_DEV_IMAGE} \
- --output-path "jet-trigger-job.yaml" \
+ --dependent-job "functional:configure" \
+ --output-path "functional-test-job-dev.yaml" \
+ ${RELEASE_ARGS[@]}
+ - |
+ export PYTHONPATH=$(pwd)
+ python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
+ --scope $FUNCTIONAL_TEST_SCOPE \
+ --environment lts \
+ --n-repeat "$FUNCTIONAL_TEST_REPEAT" \
+ --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
+ --test-cases $FUNCTIONAL_TEST_CASES \
+ --a100-cluster $A100_CLUSTER \
+ --h100-cluster $H100_CLUSTER \
+ --container-image ${UTILITY_IMAGE} \
+ --container-tag ${CI_PIPELINE_ID} \
+ --dependent-job "functional:configure" \
+ --output-path "functional-test-job-lts.yaml" \
${RELEASE_ARGS[@]}
artifacts:
paths:
- - jet-trigger-job.yaml
- - tests/functional_tests/local_recipes
+ - functional-test-job-lts.yaml
+ - functional-test-job-dev.yaml
+ - tests/test_utils/local_recipes
-jet-trigger:
+.run:
stage: functional_tests
- needs: [jet-generate]
- extends: [.jet_common]
+ needs: [functional:configure]
+ extends: [.functional_tests_rules]
trigger:
include:
- - artifact: jet-trigger-job.yaml
- job: jet-generate
+ - artifact: functional-test-job-$ENVIRONMENT.yaml
+ job: functional:configure
strategy: depend
variables:
RO_API_TOKEN: $PAT
CONTAINER_TAG: $CI_PIPELINE_ID
- CI_MCORE_IMAGE: $CI_MCORE_IMAGE
+ CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
GITLAB_ENDPOINT: $GITLAB_ENDPOINT
PARENT_PIPELINE_ID: $CI_PIPELINE_ID
inherit:
variables: true
-
-jet-results-notify:
- extends: [.jet_common]
- image: ${GITLAB_ENDPOINT}:5005/dl/jet/api:latest
- needs: [jet-trigger]
+
+functional:run_lts:
+ extends: [.run]
+ variables:
+ ENVIRONMENT: lts
+
+functional:run_dev:
+ extends: [.run]
+ variables:
+ ENVIRONMENT: dev
+
+functional:notify:
+ extends: [.functional_tests_rules]
+ image: badouralix/curl-jq
+ needs:
+ - functional:run_lts
+ - functional:run_dev
tags:
- mcore-docker-node-small
- before_script:
- - jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $VAULT_JWT_TOKEN
+ variables:
+ WEBHOOK_URL: ${MCORE_NOTIFICATION_HOOK}
+ RO_API_TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}
+ CONTEXT: $FUNCTIONAL_TEST_SCOPE
script:
+ - apk add bash
+ - apk add --update coreutils
- env
- export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
- export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
- export GITLAB_ENDPOINT
- export CONTEXT=$FUNCTIONAL_TEST_SCOPE
- export DATE=$(date +"%Y-%m-%d")
- - bash tests/functional_tests/shell_test_utils/notify.sh ${CI_PIPELINE_ID}
+ - bash tests/test_utils/shell_scripts/notify.sh ${CI_PIPELINE_ID} "functional:run_"
artifacts:
when: always
paths:
@@ -105,4 +141,3 @@ jet-results-notify:
- if: $CI_PIPELINE_SOURCE == "schedule" && $FUNCTIONAL_TEST == "yes"
when: always
- when: never
-
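`functional:configure` falls back to the project defaults when no cluster override is provided. The subshell-and-echo construct is equivalent to a plain parameter-expansion default, as this small sketch shows (`DEFAULT_A100_CLUSTER` is assumed to be a project-level CI/CD variable):

```bash
#!/usr/bin/env bash
# Cluster selection: use the override if set, otherwise the project default.
FUNCTIONAL_TEST_CLUSTER_A100=""            # empty means "use the default"
DEFAULT_A100_CLUSTER="dgxa100_dracooci"

# Form used in the job ...
A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo "$FUNCTIONAL_TEST_CLUSTER_A100" || echo "$DEFAULT_A100_CLUSTER")

# ... which behaves like the shorter parameter-expansion default:
A100_CLUSTER="${FUNCTIONAL_TEST_CLUSTER_A100:-$DEFAULT_A100_CLUSTER}"

echo "scheduling A100 jobs on: $A100_CLUSTER"
```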
diff --git a/.gitlab/stages/03.publish.yml b/.gitlab/stages/03.publish.yml
index 1deeee7285..4639d7690f 100644
--- a/.gitlab/stages/03.publish.yml
+++ b/.gitlab/stages/03.publish.yml
@@ -1,24 +1,28 @@
.publish_common_freeze:
- stage: functional_tests
+ stage: publish
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $PUBLISH == "yes" && $PUBLISH_SCOPE == "code-freeze"
when: manual
- when: never
.publish_common_release:
- stage: functional_tests
+ stage: publish
rules:
- if: $CI_COMMIT_BRANCH =~ /^core_r/ && $PUBLISH == "yes" && $PUBLISH_SCOPE == "release"
when: manual
+ - if: $PUBLISH == "yes" && $PUBLISH_SCOPE == "release"
+ when: manual
+ variables:
+ PUBLISH_DRYRUN: "yes"
- when: never
-create-release-branch:
+publish:release_branch:
extends: [.publish_common_freeze]
- image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
- needs: [build_image]
+ image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
+ needs: [test:build_image]
tags: [mcore-docker-node-small]
variables:
- GIT_STRATEGY: "clone"
+ GIT_STRATEGY: "none"
script:
- git fetch origin $CI_DEFAULT_BRANCH
- git config --global user.email "mcore-bot@nvidia.com"
@@ -26,8 +30,8 @@ create-release-branch:
- git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
- sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" megatron/core/package_info.py
- VERSION=$(python -c "from megatron import core; print(core.__version__)")
- - git switch --force-create core_r$VERSION origin/$CI_DEFAULT_BRANCH
- - git push -u origin core_r$VERSION --force
+ - RELEASE_BRANCH=core_r$VERSION
+ - git switch --force-create $RELEASE_BRANCH origin/$CI_DEFAULT_BRANCH
- |
MESSAGE='{
"blocks": [
@@ -35,61 +39,53 @@ create-release-branch:
"type": "section",
"text": {
"type": "mrkdwn",
- "text": "Releasebot 🤖: Megatron Core has been frozen 🎉 to branch `core_r$VERSION`"
+ "text": "Releasebot 🤖: Megatron Core has been frozen 🎉 to branch `'"$RELEASE_BRANCH"'`"
}
}
]
}'
-
+ - >
curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK_MAIN}
+ - git switch --force-create bot/chore/bump-version
+ - git add megatron/core/package_info.py
+ - >
+ git commit -m "chore: adjust version"
+ - git push -u origin bot/chore/bump-version
+ - >
+ curl \
+ --header "PRIVATE-TOKEN: $PAT" \
+ --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \
+ -d "source_branch=bot/chore/bump-version" \
+ -d "target_branch=$RELEASE_BRANCH" \
+ -d "title=chore: Fix version of \`$RELEASE_BRANCH\`" \
+ -d "description=[🤖]: Hi @okoenig 👋,
we've adjusted the version number of \`$RELEASE_BRANCH\` for you! 🚀
Please review and approve this cherry-pick at your convenience\!"
-publish-wheel:
- extends: [.publish_common_release]
- image: quay.io/pypa/manylinux_2_28_x86_64
- tags: [mcore-docker-node-small]
- script:
- - export TWINE_USERNAME
- - export TWINE_PASSWORT
- - /opt/python/cp311-cp311/bin/pip install twine
- - /opt/python/cp310-cp310/bin/python -m build
- - /opt/python/cp311-cp311/bin/python -m build
- - auditwheel repair dist/*.whl
- - twine upload --repository pypi wheelhouse/*
-
-create-gh-release:
- extends: [.publish_common_release]
- tags: [mcore-docker-node-small]
- image:
- name: registry.gitlab.com/gitlab-ci-utils/curl-jq
- entrypoint: [""]
- script:
- - |
- RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
- NAME="NVIDIA Megatron Core $RELEASE_NUMBER"
- CHANGELOG=$(awk '/^## '$NAME'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
- CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d')
-
- PAYLOAD=$(jq \
- -n \
- -c \
- --arg CI_COMMIT_BRANCH "$CI_COMMIT_BRANCH" \
- --arg NAME "$NAME" \
- --arg BODY "$CHANGELOG" \
- '{
- "tag_name": $CI_COMMIT_BRANCH,
- "target_commitish": $CI_COMMIT_BRANCH,
- "name": $NAME,
- "body": $BODY,
- "draft": false,
- "prerelease": false,
- "generate_release_notes": false
- }'
- )
+publish:pypi_build_wheel:
+ extends: [test:pypi_build_wheel, .publish_common_release]
+ dependencies: []
+ variables:
+ PUBLISH_DRYRUN: "no"
+
+publish:pypi_test_wheel:
+ extends: [test:pypi_test_wheel, .publish_common_release]
+ needs: [publish:pypi_build_wheel]
+ variables:
+ PUBLISH_DRYRUN: "no"
+
+publish:pypi_push_wheel:
+ extends: [test:pypi_push_wheel, .publish_common_release]
+ needs: [publish:pypi_test_wheel]
+ variables:
+ PUBLISH_DRYRUN: "no"
- curl -L \
- -X POST \
- -H "Accept: application/vnd.github+json" \
- -H "Authorization: Bearer $GH_TOKEN" \
- -H "X-GitHub-Api-Version: 2022-11-28" \
- https://api.github.com/repos/NVIDIA/Megatron-LM/releases \
- -d $PAYLOAD
\ No newline at end of file
+publish:gh_release:
+ extends: [test:gh_release, .publish_common_release]
+ dependencies: []
+ variables:
+ PUBLISH_DRYRUN: "no"
+
+publish:notify_release:
+ needs: [publish:pypi_push_wheel, publish:gh_release]
+ extends: [test:notify_release, .publish_common_release]
+ variables:
+ PUBLISH_DRYRUN: "no"
\ No newline at end of file
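The publish jobs reuse the `test:*` job bodies and only flip `PUBLISH_DRYRUN` from `yes` to `no`, so every release step can be rehearsed without side effects. Below is a hedged sketch of that gate; it uses a bash array instead of the job's `echo`/`eval` pair, and the webhook URL is a placeholder rather than the real `MCORE_NOTIFICATION_HOOK`:

```bash
#!/usr/bin/env bash
# Dry-run gate shared by the test:* and publish:* release jobs.
PUBLISH_DRYRUN="${PUBLISH_DRYRUN:-yes}"

CMD=(curl -X POST
     -H "Content-type: application/json"
     --data '{"text": "release ping"}'
     "https://example.invalid/webhook")       # placeholder endpoint

if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
  printf 'DRYRUN: %q ' "${CMD[@]}"; echo      # print the command instead of running it
else
  "${CMD[@]}"                                 # real publish: execute it
fi
```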
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 78db8212aa..7960574199 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,23 @@
# Changelog
+## NVIDIA Megatron Core 0.9.0
+
+- Uneven pipeline parallelism
+ - Enable pipeline parallelism where first and last ranks have fewer transformer layers than the intermediate ranks
+- Per layer CUDAGraph support for GPT training with Transformer Engine modules
+- Enable different TP sizes for the vision encoder
+- Enable pipeline parallelism for T5 & Llava models
+- Support multi-tile multi-image input in Llava models
+- MoE
+ - FP8 support
+ - Runtime upcycling support
+ - Dispatcher implementation optimizations
+ - Shared expert support with overlapping optimizations
+ - Qwen Model support
+- Known Issues
+ - When using sequence parallelism, dropout in the transformer block forward pass does not use the appropriate RNG context.
+
+
## NVIDIA Megatron Core 0.8.0
- Multimodal
diff --git a/CODEOWNERS b/CODEOWNERS
index 7e7f730e3a..e89c62b06e 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -34,14 +34,16 @@ megatron/core/optimizer/distrib_optimizer/
[Inference] @mcore-reviewers/inference
megatron/core/inference/
-[Quantization and Inference (QAT)] @mcore-reviewers/quantization-and-inference
+^[Quantization and Inference (QAT)] @mcore-reviewers/quantization-and-inference
megatron/core/inference/
; [Context Parallelism] @mcore-reviewers/context-parallelism
;
[CI] @mcore-reviewers/ci
+.gitlab/
+.github/
.gitlab-ci.yml
-Dockerfile.ci
-jet-tests.yml
+Dockerfile.ci.lts
+Dockerfile.ci.dev
tests/
diff --git a/Dockerfile.ci b/Dockerfile.ci
deleted file mode 100644
index f1b693b9d9..0000000000
--- a/Dockerfile.ci
+++ /dev/null
@@ -1,63 +0,0 @@
-# syntax=docker/dockerfile:1.3-labs
-
-ARG FROM_IMAGE_NAME
-FROM $FROM_IMAGE_NAME as build_causal_conv1d
-WORKDIR /opt
-RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1
-
-FROM $FROM_IMAGE_NAME as build_grouped_gemm
-WORKDIR /opt
-RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2
-
-FROM $FROM_IMAGE_NAME as build_mamba_ssm
-WORKDIR /opt
-RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3
-
-FROM $FROM_IMAGE_NAME as main
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update && \
- apt-get install -y --no-install-recommends gettext python3-venv && \
- apt-get clean && \
- python -m venv /opt/jet && \
- wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \
- chmod a+x /usr/local/bin/yq
-
-COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./
-COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./
-COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./
-
-RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \
-einops \
-flask-restful \
-nltk \
-pytest \
-pytest-cov \
-pytest_mock \
-pytest-random-order \
-sentencepiece \
-tiktoken \
-wrapt \
-zarr \
-wandb \
-triton==2.1.0 \
-causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \
-mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \
-grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \
-tensorstore==0.1.45 && \
-rm *.whl
-
-# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker
-COPY . /opt/megatron-lm
-RUN pip install /opt/megatron-lm
-ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH"
-
-##### For NVIDIANS only #####
-FROM main as jet
-ARG CACHEBUST=0
-RUN --mount=type=secret,id=JET_INDEX_URLS \
- JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
- pip install jet-client --upgrade $JET_INDEX_URLS && \
- /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS
-ENV PATH="$PATH:/opt/jet/bin"
-###
\ No newline at end of file
diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev
index f1b693b9d9..c631282c2d 100644
--- a/Dockerfile.ci.dev
+++ b/Dockerfile.ci.dev
@@ -11,7 +11,7 @@ RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2
FROM $FROM_IMAGE_NAME as build_mamba_ssm
WORKDIR /opt
-RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3
+RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.2.0
FROM $FROM_IMAGE_NAME as main
ENV DEBIAN_FRONTEND=noninteractive
@@ -23,33 +23,47 @@ RUN apt-get update && \
wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \
chmod a+x /usr/local/bin/yq
-COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./
-COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./
-COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./
-
-RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \
-einops \
-flask-restful \
-nltk \
-pytest \
-pytest-cov \
-pytest_mock \
-pytest-random-order \
-sentencepiece \
-tiktoken \
-wrapt \
-zarr \
-wandb \
-triton==2.1.0 \
-causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \
-mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \
-grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \
-tensorstore==0.1.45 && \
-rm *.whl
+COPY --from=build_causal_conv1d /opt/causal_conv1d-*.whl ./
+COPY --from=build_grouped_gemm /opt/grouped_gemm-*.whl ./
+COPY --from=build_mamba_ssm /opt/mamba_ssm-*.whl ./
+
+RUN \
+ --mount=type=bind,source=requirements,target=requirements \
+ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+ --mount=type=bind,source=setup.py,target=setup.py \
+ --mount=type=bind,source=megatron/core/package_info.py,target=megatron/core/package_info.py \
+ --mount=type=bind,source=megatron/core/README.md,target=megatron/core/README.md \
+ --mount=type=bind,source=megatron/core/__init__.py,target=megatron/core/__init__.py <<"EOF" bash -ex
+
+pip install causal_conv1d-*.whl mamba_ssm-*.whl grouped_gemm-*.whl
+PY_ENV=pytorch:24.07 pip install .
+EOF
# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker
-COPY . /opt/megatron-lm
-RUN pip install /opt/megatron-lm
+ARG MCORE_REPO
+ARG MCORE_REF
+ARG MCORE_BACKWARDS_REF
+RUN <<"EOF" bash -exu
+# Checkout latest
+cd /opt
+rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
+git init
+git remote add origin ${MCORE_REPO}
+git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
+git fetch origin $MCORE_REF
+git checkout $MCORE_REF
+
+# Checkout backwards-ref
+cd /opt
+rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
+git init
+git remote add origin ${MCORE_REPO}
+git fetch origin $MCORE_BACKWARDS_REF
+git checkout $MCORE_BACKWARDS_REF
+rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
+EOF
+
+RUN PY_ENV=pytorch:24.07 pip install -e /opt/megatron-lm
ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH"
##### For NVIDIANS only #####
@@ -57,7 +71,6 @@ FROM main as jet
ARG CACHEBUST=0
RUN --mount=type=secret,id=JET_INDEX_URLS \
JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
- pip install jet-client --upgrade $JET_INDEX_URLS && \
- /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS
+ pip install jet-client jet-api --upgrade $JET_INDEX_URLS
ENV PATH="$PATH:/opt/jet/bin"
###
\ No newline at end of file
diff --git a/Dockerfile.ci.lts b/Dockerfile.ci.lts
new file mode 100644
index 0000000000..ea0cf31a0b
--- /dev/null
+++ b/Dockerfile.ci.lts
@@ -0,0 +1,77 @@
+# syntax=docker/dockerfile:1.3-labs
+
+ARG FROM_IMAGE_NAME
+FROM $FROM_IMAGE_NAME as build_causal_conv1d
+WORKDIR /opt
+RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1
+
+FROM $FROM_IMAGE_NAME as build_grouped_gemm
+WORKDIR /opt
+RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2
+
+FROM $FROM_IMAGE_NAME as build_mamba_ssm
+WORKDIR /opt
+RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3
+
+ARG FROM_IMAGE_NAME
+FROM $FROM_IMAGE_NAME as main
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends gettext python3-venv && \
+ apt-get clean && \
+ python -m venv /opt/jet && \
+ wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \
+ chmod a+x /usr/local/bin/yq
+
+COPY --from=build_causal_conv1d /opt/causal_conv1d-*.whl ./
+COPY --from=build_grouped_gemm /opt/grouped_gemm-*.whl ./
+COPY --from=build_mamba_ssm /opt/mamba_ssm-*.whl ./
+
+RUN \
+ --mount=type=bind,source=requirements,target=requirements \
+ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+ --mount=type=bind,source=setup.py,target=setup.py \
+ --mount=type=bind,source=megatron/core/package_info.py,target=megatron/core/package_info.py \
+ --mount=type=bind,source=megatron/core/README.md,target=megatron/core/README.md \
+ --mount=type=bind,source=megatron/core/__init__.py,target=megatron/core/__init__.py <<"EOF" bash -ex
+
+pip install causal_conv1d-*.whl mamba_ssm-*.whl grouped_gemm-*.whl
+PY_ENV=pytorch:24.07 pip install .
+EOF
+
+# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker
+ARG MCORE_REPO
+ARG MCORE_REF
+ARG MCORE_BACKWARDS_REF
+RUN <<"EOF" bash -exu
+# Checkout latest
+cd /opt
+rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
+git init
+git remote add origin ${MCORE_REPO}
+git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
+git fetch origin $MCORE_REF
+git checkout $MCORE_REF
+
+# Checkout backwards-ref
+cd /opt
+rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
+git init
+git remote add origin ${MCORE_REPO}
+git fetch origin $MCORE_BACKWARDS_REF
+git checkout $MCORE_BACKWARDS_REF
+rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
+EOF
+
+RUN PY_ENV=pytorch:24.01 pip install -e /opt/megatron-lm
+ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH"
+
+##### For NVIDIANS only #####
+FROM main as jet
+ARG CACHEBUST=0
+RUN --mount=type=secret,id=JET_INDEX_URLS \
+ JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
+ pip install jet-api jet-client --upgrade $JET_INDEX_URLS
+ENV PATH="$PATH:/opt/jet/bin"
+###
\ No newline at end of file
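Both CI Dockerfiles build the backwards-compatibility layout in their `RUN` heredocs: the ref under test lands in `/opt/megatron-lm`, an older ref lands in `/opt/megatron-lm-legacy`, and the legacy tree then gets the current `megatron/` package copied over so its test suite exercises the new code. A rough bash sketch of that flow, assuming `MCORE_REPO`, `MCORE_REF`, and `MCORE_BACKWARDS_REF` are provided as the build args do:

```bash
#!/usr/bin/env bash
# Sketch of the backwards-compatibility checkout used in the Dockerfile heredocs:
#   /opt/megatron-lm        -> the ref under test (MCORE_REF)
#   /opt/megatron-lm-legacy -> an older ref (MCORE_BACKWARDS_REF) whose tests
#                              run against the *current* megatron/ sources.
set -exu

checkout() {   # checkout <dir> <ref>
  rm -rf "$1" && mkdir -p "$1" && cd "$1"
  git init
  git remote add origin "$MCORE_REPO"
  # GitLab-specific refspec: make merge-request heads fetchable as well
  git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
  git fetch origin "$2"
  git checkout "$2"
}

checkout /opt/megatron-lm        "$MCORE_REF"
checkout /opt/megatron-lm-legacy "$MCORE_BACKWARDS_REF"

# Legacy tests, current library: swap the old megatron/ package for the new one.
cd /opt/megatron-lm-legacy
rm -rf megatron && cp -a /opt/megatron-lm/megatron ./
```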
diff --git a/Dockerfile.linting b/Dockerfile.linting
index b0670af9d1..ff1a28cefd 100644
--- a/Dockerfile.linting
+++ b/Dockerfile.linting
@@ -7,15 +7,27 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \
/etc/apt/apt.conf.d/docker-clean
+RUN apt-get update && \
+ apt-get install -y python3-venv && \
+ apt-get clean && \
+ python -m venv /opt/jet
RUN pip3 install --no-cache-dir \
black==24.4.2 \
isort==5.13.2 \
flake8==7.1.0 \
- pylint==3.2.6
+ pylint==3.2.6 \
+ mypy
COPY . /opt/megatron-lm
WORKDIR /opt/megatron-lm
-FROM main as jet
\ No newline at end of file
+##### For NVIDIANS only #####
+FROM main as jet
+ARG CACHEBUST=0
+RUN --mount=type=secret,id=JET_INDEX_URLS \
+ JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
+ pip install jet-client jet-api --upgrade $JET_INDEX_URLS
+ENV PATH="$PATH:/opt/jet/bin"
+###
\ No newline at end of file
diff --git a/Dockerfile_rocm.ci b/Dockerfile_rocm.ci
old mode 100644
new mode 100755
index 253b446a29..d397f469c3
--- a/Dockerfile_rocm.ci
+++ b/Dockerfile_rocm.ci
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER=rocm/pytorch:latest
+ARG BASE_DOCKER=rocm/pytorch:rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.5.1_preview
FROM $BASE_DOCKER
ARG PYTORCH_ROCM_ARCH_OVERRIDE="gfx942"
@@ -60,11 +60,19 @@ RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git &&\
cd TransformerEngine &&\
pip install .
+RUN git clone https://github.com/caaatch22/grouped_gemm.git &&\
+ cd grouped_gemm &&\
+ git checkout rocm &&\
+ git submodule update --init --recursive &&\
+ pip install .
+
WORKDIR $WORKSPACE_DIR
COPY . Megatron-LM
WORKDIR $WORKSPACE_DIR/Megatron-LM
RUN pip install -e .
+ENV PYTHONPATH=/var/lib/jenkins/triton/python
+
# record configuration for posterity
RUN pip list
diff --git a/Dockerfile_rocm.dev b/Dockerfile_rocm.dev
old mode 100644
new mode 100755
index d253193b67..6ab2bbbe4b
--- a/Dockerfile_rocm.dev
+++ b/Dockerfile_rocm.dev
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER=rocm/pytorch:latest
+ARG BASE_DOCKER=rocm/pytorch:rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.5.1_preview
FROM $BASE_DOCKER
ARG PYTORCH_ROCM_ARCH_OVERRIDE="gfx942"
ENV WORKSPACE_DIR=/workspace
@@ -59,6 +59,12 @@ RUN git clone --recursive https://github.com/ROCm/TransformerEngine &&\
cd TransformerEngine &&\
pip install .
+RUN git clone https://github.com/caaatch22/grouped_gemm.git &&\
+ cd grouped_gemm &&\
+ git checkout rocm &&\
+ git submodule update --init --recursive &&\
+ pip install .
+
WORKDIR $WORKSPACE_DIR
RUN git clone https://github.com/ROCm/Megatron-LM.git Megatron-LM &&\
cd Megatron-LM &&\
@@ -67,6 +73,8 @@ RUN git clone https://github.com/ROCm/Megatron-LM.git Megatron-LM &&\
WORKDIR $WORKSPACE_DIR/Megatron-LM
+ENV PYTHONPATH=/var/lib/jenkins/triton/python
+
# record configuration for posterity
RUN pip list
diff --git a/LICENSE b/LICENSE
index 16814e9d13..b4193aff50 100644
--- a/LICENSE
+++ b/LICENSE
@@ -30,8 +30,9 @@ The following applies to all files unless otherwise noted:
This repository also contains code from Hugging Face Inc., Google Research,
Facebook (from their Fairseq, Dino, and ParlAI projects), Microsoft (from their
-Swin-Transformer project), Philip Popien, and the Mamba project (Tri Dao and
-Albert Gu). Files from these organizations have notices at the top of each file.
+Swin-Transformer project), Philip Popien, the Mamba project (Tri Dao and
+Albert Gu), and the Triton language and compiler project (Philippe Tillet and
+OpenAI). Files from these organizations have notices at the top of each file.
Below are licenses used in those files, as indicated.
@@ -241,7 +242,13 @@ Below are licenses used in those files, as indicated.
See the License for the specific language governing permissions and
limitations under the License.
-------------- LICENSE FOR Facebook, Inc. and its affiliates, Meta Platforms, Inc. and its affiliates, Microsoft Corporation, and OpenGVLab/InternVL --------------
+--------------------------------------------------------------------------------
+LICENSE FOR
+Facebook, Inc. and its affiliates,
+Meta Platforms, Inc. and its affiliates,
+Microsoft Corporation,
+OpenGVLab/InternVL, and
+Triton language and compiler.
MIT License
diff --git a/MANIFEST.in b/MANIFEST.in
index dbb29b0a1c..dbed9c4061 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,3 @@
include megatron/core/requirements.txt
-include megatron/core/README.md
\ No newline at end of file
+include megatron/core/README.md
+recursive-include requirements *
diff --git a/README.md b/README.md
index 138944b5cd..a8e553deca 100644
--- a/README.md
+++ b/README.md
@@ -19,41 +19,50 @@ Megatron-LM & Megatron-Core
# Table of Contents
- * [Megatron Overview](#megatron-overview)
- * [Megatron-LM](#megatron-lm)
- * [Megatron-Core](#megatron-core)
- * [Training Speed and Scalability](#training-speed-and-scalability)
- * [Setup](#setup)
- * [Downloading Checkpoints](#downloading-checkpoints)
- * [Usage](#usage)
- * [Training](#training)
- * [Data Preprocessing](#data-preprocessing)
- * [BERT Pretraining](#bert-pretraining)
- * [GPT Pretraining](#gpt-pretraining)
- * [T5 Pretraining](#t5-pretraining)
- * [Distributed Pretraining](#distributed-pretraining)
- * [Activation Checkpointing and Recomputation](#activation-checkpointing-and-recomputation)
- * [Distributed Optimizer](#distributed-optimizer)
- * [FlashAttention](#flashattention)
- * [GPT-3 Example](#gpt-3-example)
- * [Retro and InstructRetro](#retro-and-instructretro)
- * [Evaluation and Tasks](#evaluation-and-tasks)
- * [GPT Text Generation](#gpt-text-generation)
- * [GPT Evaluation](#gpt-evaluation)
- * [WikiText Perplexity Evaluation](#wikitext-perplexity-evaluation)
- * [LAMBADA Cloze Accuracy](#lambada-cloze-accuracy)
- * [BERT Task Evaluation](#bert-task-evaluation)
- * [RACE Evaluation](#race-evaluation)
- * [MNLI Evaluation](#mnli-evaluation)
- * [Llama-2 Inference and Finetuning](#llama-2-inference-and-finetuning)
- * [Datasets](#datasets)
- * [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data)
- * [Collecting GPT Webtext Data](#collecting-gpt-webtext-data)
- * [Reproducibility](#reproducibility)
- * [Projects using Megatron](#projects-using-megatron)
+- [Megatron-LM \& Megatron-Core](#megatron-lm--megatron-core)
+- [Latest News](#latest-news)
+- [Table of Contents](#table-of-contents)
+- [Megatron Overview](#megatron-overview)
+ - [Megatron-LM](#megatron-lm)
+ - [Megatron-Core](#megatron-core)
+- [Training Speed and Scalability](#training-speed-and-scalability)
+- [Setup](#setup)
+ - [Downloading Checkpoints](#downloading-checkpoints)
+- [Usage](#usage)
+- [Training](#training)
+ - [Data Preprocessing](#data-preprocessing)
+ - [BERT Pretraining](#bert-pretraining)
+ - [GPT Pretraining](#gpt-pretraining)
+ - [T5 Pretraining](#t5-pretraining)
+ - [Distributed Pretraining](#distributed-pretraining)
+ - [Activation Checkpointing and Recomputation](#activation-checkpointing-and-recomputation)
+ - [Distributed Optimizer](#distributed-optimizer)
+ - [FlashAttention](#flashattention)
+ - [GPT-3 Example](#gpt-3-example)
+ - [Retro and InstructRetro](#retro-and-instructretro)
+ - [Mamba-based Language Models](#mamba-based-language-models)
+ - [Mixture of Experts](#mixture-of-experts)
+ - [Key Features of MoE](#key-features-of-moe)
+- [Evaluation and Tasks](#evaluation-and-tasks)
+ - [GPT Text Generation](#gpt-text-generation)
+ - [Detoxify GPT via Self-generation](#detoxify-gpt-via-self-generation)
+ - [GPT Evaluation](#gpt-evaluation)
+ - [WikiText Perplexity Evaluation](#wikitext-perplexity-evaluation)
+ - [LAMBADA Cloze Accuracy](#lambada-cloze-accuracy)
+ - [BERT Task Evaluation](#bert-task-evaluation)
+ - [RACE Evaluation](#race-evaluation)
+ - [MNLI Evaluation](#mnli-evaluation)
+ - [Llama-2 Inference and Finetuning](#llama-2-inference-and-finetuning)
+- [Model Optimization and Deployment](#model-optimization-and-deployment)
+ - [Quantization and TensorRT-LLM Deployment](#quantization-and-tensorrt-llm-deployment)
+- [Datasets](#datasets)
+ - [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data)
+ - [Collecting GPT Webtext Data](#collecting-gpt-webtext-data)
+- [Reproducibility](#reproducibility)
+ - [Projects Using Megatron](#projects-using-megatron)
# Megatron Overview
-This repository comprises two essential components: **Megatron-LM** and **Megatron-Core**. Megatron-LM serves as a ressearch-oriented framework leveraging Megatron-Core for large language model (LLM) training. Megatron-Core, on the other hand, is a library of GPU optimized training techniques that comes with formal product support including versioned APIs and regular releases. You can use Megatron-Core alongside Megatron-LM or [Nvidia NeMo Framework](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/mcore_customization.html) for an end-to-end and cloud-native solution. Alternatively, you can integrate Megatron-Core's building blocks into your preferred training framework.
+This repository comprises two essential components: **Megatron-LM** and **Megatron-Core**. Megatron-LM serves as a research-oriented framework leveraging Megatron-Core for large language model (LLM) training. Megatron-Core, on the other hand, is a library of GPU optimized training techniques that comes with formal product support including versioned APIs and regular releases. You can use Megatron-Core alongside Megatron-LM or [Nvidia NeMo Framework](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/mcore_customization.html) for an end-to-end and cloud-native solution. Alternatively, you can integrate Megatron-Core's building blocks into your preferred training framework.
## Megatron-LM
First introduced in 2019, Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) sparked a wave of innovation in the AI community, enabling researchers and developers to utilize the underpinnings of this library to further LLM advancements. Today, many of the most popular LLM developer frameworks have been inspired by and built directly leveraging the open-source Megatron-LM library, spurring a wave of foundation models and AI startups. Some of the most popular LLM frameworks built on top of Megatron-LM include [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [HuggingFace Accelerate](https://github.com/huggingface/accelerate), and [NVIDIA NeMo Framework](https://www.nvidia.com/en-us/ai-data-science/generative-ai/nemo-framework/). A list of projects that have directly used Megatron can be found [here](#projects-using-megatron).
@@ -362,6 +371,17 @@ python tools/create_doc_index.py \
-->
+## Mixture of Experts
+MoE (Mixture of Experts) is a powerful LLM architecture implemented in the Megatron-Core framework, designed to enhance the efficiency and scalability of large language models. It leverages **Expert Parallelism**, distributing experts across different workers so that each worker processes distinct batches of training samples. This significantly increases computational throughput, enabling models to achieve high performance, such as 47% MFU during BF16 training of an 8x7B model on H100 GPUs.
+
+### Key Features of MoE
+- **Parallelism Techniques**: MoE combines various parallelism strategies, including Expert Parallelism, Data Parallelism, Tensor Parallelism, Sequence Parallelism, Pipeline Parallelism, and Context Parallelism. This combination allows for handling larger model variants effectively.
+- **Router and Load Balancing**: The system employs advanced routing mechanisms like the Top-K router and utilizes load balancing algorithms to optimize token distribution among experts.
+- **Performance Optimizations**: Techniques such as GroupedGEMM and FP8 training enhance the efficiency of MoE models, particularly when multiple experts are involved.
+- **Token Dispatch Mechanism**: MoE supports both dropless and token drop strategies to manage token distribution effectively across experts.
+
+For a comprehensive overview of MoE training configurations and optimizations, please refer to the detailed README located at [megatron/core/transformer/moe/README.md](./megatron/core/transformer/moe/README.md).
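+
+As a rough sketch, enabling MoE typically involves flags along these lines (illustrative values, not a tuned recipe; `--moe-grouped-gemm` and `--expert-model-parallel-size` are the expected spellings but should be checked against the argument definitions in `megatron/training/arguments.py` for your version):
+
+```sh
+MOE_ARGS=(
+    --num-experts 8                            # experts per MoE layer
+    --moe-router-topk 2                        # Top-K router: each token is routed to 2 experts
+    --moe-router-load-balancing-type aux_loss  # balance expert load via an auxiliary loss
+    --moe-aux-loss-coeff 1e-2
+    --moe-grouped-gemm                         # use GroupedGEMM kernels when multiple experts share a rank
+    --expert-model-parallel-size 8             # Expert Parallelism: distribute experts across 8 ranks
+)
+```
+
+These flags are then appended to a regular GPT pretraining command, e.g. `pretrain_gpt.py ... "${MOE_ARGS[@]}"`.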
+
# Evaluation and Tasks
We provide several command line arguments, detailed in the scripts listed below, to handle various zero-shot and fine-tuned downstream tasks. However, you can also finetune your model from a pretrained checkpoint on other corpora as desired. To do so, simply add the `--finetune` flag and adjust the input files and training parameters within the original training script. The iteration count will be reset to zero, and the optimizer and internal state will be reinitialized. If the finetuning is interrupted for any reason, be sure to remove the `--finetune` flag before continuing; otherwise, training will start again from the beginning.
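+
+For example, a finetuning launch might look like the following sketch (the launcher settings, argument groups, and paths are placeholders taken from the example scripts; the relevant part is the `--load` / `--finetune` pair):
+
+```sh
+torchrun --nproc_per_node=8 pretrain_gpt.py \
+    "${GPT_MODEL_ARGS[@]}" "${TRAINING_ARGS[@]}" \
+    --load /path/to/pretrained/checkpoint \
+    --finetune  # resets the iteration count and optimizer state; remove when resuming an interrupted finetuning run
+```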
@@ -540,7 +560,7 @@ python tasks/main.py \
The Llama-2 [family of models](https://ai.meta.com/llama/) are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At the time of release, Llama-2 models achieved among the best results for open-source models, and were competitive with the closed-source GPT-3.5 model (see https://arxiv.org/pdf/2307.09288.pdf).
-The Llama-2 checkpoints can be loaded into Megatron for inference and finetuning. See documentation [here](docs/llama2.md).
+The Llama-2 checkpoints can be loaded into Megatron for inference and finetuning. See documentation [here](docs/llama_mistral.md).
# Model Optimization and Deployment
Megatron-Core (MCore) `GPTModel` family supports advanced quantization algorithms and high-performance inference through TensorRT-LLM.
diff --git a/docs/source/api-guide/context_parallel.rst b/docs/source/api-guide/context_parallel.rst
index c381f66e8b..c08defd210 100644
--- a/docs/source/api-guide/context_parallel.rst
+++ b/docs/source/api-guide/context_parallel.rst
@@ -25,7 +25,7 @@ Context parallelism benefits
LLMs encounter OOM (out of memory) issues with long context (i.e., long sequence length) because the memory footprint of activations increases linearly with sequence length. Recomputing activations in the backward pass can avoid OOM but also introduces significant overhead (~30% with full recompute). Enlarging TP (tensor model parallelism) can fix the OOM issue as well, but it potentially makes compute (e.g., Linear) too short to overlap with communication latencies. To be clear, scaling out to more GPUs with a bigger TP can hit the overlapping problem regardless of whether OOM happens.
-CP can better address the issues. With CP, each GPU only computes on a part of the sequence, which reduces both computation and communication by CP times. Therefore, there are no concerns about the overlapping between them. The activation memory footprint per GPU is also CP times smaller, hence no OOM issue any more. As Figure 2 shows, the combinations of TP and CP can achieve optimal performance by eliminating recompute overheads and making the best tradeoff between computation and communications.
+CP better addresses these issues. With CP, each GPU only computes on a part of the sequence, which reduces both computation and communication by a factor of CP, so there are no concerns about overlapping them. The activation memory footprint per GPU is also CP times smaller, hence no OOM issue anymore. As Figure 2 shows, combinations of TP and CP can achieve optimal performance by eliminating recompute overheads and making the best tradeoff between computation and communication.
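+
+As a back-of-the-envelope illustration (numbers chosen purely for intuition): if activation memory per GPU grows roughly linearly with the local sequence length, then
+
+.. math:: M_{\text{act}} \propto \frac{S}{CP}
+
+so a sequence of :math:`S = 131072` tokens with :math:`CP = 8` leaves each GPU holding activations for only :math:`16384` tokens, the "CP times smaller" footprint described above.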
Enabling context parallelism
----------------------------
diff --git a/examples/bert/train_bert_340m_distributed.sh b/examples/bert/train_bert_340m_distributed.sh
index dada370a94..f0d9c87c8b 100644
--- a/examples/bert/train_bert_340m_distributed.sh
+++ b/examples/bert/train_bert_340m_distributed.sh
@@ -30,6 +30,7 @@ BERT_MODEL_ARGS=(
--num-attention-heads 16
--seq-length 512
--max-position-embeddings 512
+ --attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
diff --git a/examples/export/knowledge_distillation/pretrain_gpt_modelopt.py b/examples/export/knowledge_distillation/pretrain_gpt_modelopt.py
new file mode 100644
index 0000000000..65d0727d8c
--- /dev/null
+++ b/examples/export/knowledge_distillation/pretrain_gpt_modelopt.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""Pretrain GPT."""
+import os
+import sys
+from functools import partial
+
+# This file isn't located in project root, but to import, it should pretend to be.
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
+
+from megatron.core import mpu
+from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
+from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
+from megatron.core.datasets.utils import get_blend_from_list
+from megatron.core.enums import ModelType
+from megatron.core.models.gpt import GPTModel
+from megatron.core.utils import StragglerDetector
+from megatron.inference.arguments import add_modelopt_args
+from megatron.inference.gpt import loss_func, model_provider
+from megatron.training import get_args, get_timers, get_tokenizer, pretrain
+from megatron.training.utils import (
+ get_batch_on_this_cp_rank,
+ get_batch_on_this_tp_rank,
+ print_rank_0,
+)
+
+stimer = StragglerDetector()
+
+
+def get_batch(data_iterator):
+ """Generate a batch."""
+
+ # TODO: this is pretty hacky, find a better way
+ if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()):
+ return None, None, None, None, None
+
+ # get batches based on the TP rank you are on
+ batch = get_batch_on_this_tp_rank(data_iterator)
+
+ # slice batch along sequence dimension for context parallelism
+ batch = get_batch_on_this_cp_rank(batch)
+
+ return batch.values()
+
+
+def forward_step(data_iterator, model: GPTModel):
+ """Forward training step.
+
+ Args:
+ data_iterator : Input data iterator
+ model (GPTModel): The GPT Model
+ """
+ timers = get_timers()
+
+ # Get the batch.
+ timers('batch-generator', log_level=2).start()
+ global stimer
+ with stimer(bdata=True):
+ tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator)
+ timers('batch-generator').stop()
+
+ with stimer:
+ output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
+
+ # [ModelOpt]: model is needed to access ModelOpt distillation losses
+ return output_tensor, partial(loss_func, loss_mask, model)
+
+
+def is_dataset_built_on_rank():
+ return (
+ mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()
+ ) and mpu.get_tensor_model_parallel_rank() == 0
+
+
+def core_gpt_dataset_config_from_args(args):
+ tokenizer = get_tokenizer()
+
+ return GPTDatasetConfig(
+ random_seed=args.seed,
+ sequence_length=args.seq_length,
+ blend=get_blend_from_list(args.data_path),
+ blend_per_split=[
+ get_blend_from_list(args.train_data_path),
+ get_blend_from_list(args.valid_data_path),
+ get_blend_from_list(args.test_data_path),
+ ],
+ split=args.split,
+ num_dataset_builder_threads=args.num_dataset_builder_threads,
+ path_to_cache=args.data_cache_path,
+ mmap_bin_files=args.mmap_bin_files,
+ tokenizer=tokenizer,
+ reset_position_ids=args.reset_position_ids,
+ reset_attention_mask=args.reset_attention_mask,
+ eod_mask_loss=args.eod_mask_loss,
+ create_attention_mask=args.create_attention_mask_in_dataloader,
+ )
+
+
+def train_valid_test_datasets_provider(train_val_test_num_samples):
+ """Build the train test and validation datasets.
+
+ Args:
+ train_val_test_num_samples : A list containing the number of samples in train test and validation.
+ """
+ args = get_args()
+
+ config = core_gpt_dataset_config_from_args(args)
+
+ if args.mock_data:
+ dataset_type = MockGPTDataset
+ else:
+ dataset_type = GPTDataset
+
+ print_rank_0("> building train, validation, and test datasets for GPT ...")
+
+ train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
+ dataset_type, train_val_test_num_samples, is_dataset_built_on_rank, config
+ ).build()
+
+ print_rank_0("> finished creating GPT datasets ...")
+
+ return train_ds, valid_ds, test_ds
+
+
+if __name__ == "__main__":
+ # Temporary for transition to core datasets
+ train_valid_test_datasets_provider.is_distributed = True
+
+ pretrain(
+ train_valid_test_datasets_provider,
+ model_provider,
+ ModelType.encoder_or_decoder,
+ forward_step,
+ args_defaults={"tokenizer_type": "GPT2BPETokenizer"},
+ extra_args_provider=add_modelopt_args,
+ )
diff --git a/examples/export/ptq_and_trtllm_export/README.md b/examples/export/ptq_and_trtllm_export/README.md
index c5255f7ccf..abaa0d7645 100644
--- a/examples/export/ptq_and_trtllm_export/README.md
+++ b/examples/export/ptq_and_trtllm_export/README.md
@@ -250,4 +250,46 @@ python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokeniz
python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B
#For llama-3.1
+```
+
+
+### Mixtral-8x7B FP8 Quantization and TensorRT-LLM Deployment
+First download the Mixtral-8x7B NeMo checkpoint from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mixtral-8x7b-v01 and extract the
+sharded checkpoint from the `.nemo` tarball.
+
+```sh
+ngc registry model download-version "nvidia/nemo/mixtral-8x7b-v01:1.0"
+cd mixtral-8x7b-v01_v1.0
+tar -xvf mixtral.nemo
+cd ..
+```
+
+Then log in to Hugging Face so that you can access the model.
+
+> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to mistralai/Mixtral-8x7B-v0.1 on huggingface
+
+```sh
+pip install -U "huggingface_hub[cli]"
+huggingface-cli login
+```
+
+Now launch the PTQ + TensorRT-LLM checkpoint export script:
+
+```sh
+bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh ./mixtral-8x7b-v01_v1.0/
+```
+
+Then build the TensorRT engine and run the text generation example using the newly built engine:
+
+```sh
+export trtllm_options=" \
+ --checkpoint_dir /tmp/trtllm_ckpt \
+ --output_dir /tmp/trtllm_engine \
+ --max_input_len 2048 \
+ --max_seq_len 512 \
+ --max_batch_size 8 "
+
+trtllm-build ${trtllm_options}
+
+python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mixtral-8x7B-v0.1
```
\ No newline at end of file
diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh
index a6251663f7..94ee12db41 100644
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh
@@ -7,11 +7,6 @@ NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
-# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
-export NVTE_FLASH_ATTN=0
-export NVTE_FUSED_ATTN=0
-export NVTE_UNFUSED_ATTN=1
-
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="1"
INFERENCE_TP=${TP}
@@ -37,6 +32,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--disable-bias-linear \
+ --attention-backend unfused \
--swiglu \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh
index f181c8c2dd..dfa5a80c26 100644
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh
@@ -7,10 +7,6 @@ NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
-# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
-export NVTE_FLASH_ATTN=0
-export NVTE_FUSED_ATTN=0
-export NVTE_UNFUSED_ATTN=1
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="1"
@@ -37,6 +33,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--disable-bias-linear \
+ --attention-backend unfused \
--swiglu \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh
index 31ec192fd5..6e57972e30 100644
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh
@@ -7,11 +7,6 @@ NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
-# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
-export NVTE_FLASH_ATTN=0
-export NVTE_FUSED_ATTN=0
-export NVTE_UNFUSED_ATTN=1
-
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
@@ -36,6 +31,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--apply-layernorm-1p \
+    --attention-backend unfused \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--no-rope-fusion \
diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh
index 3eb02d2e1d..8469945f08 100644
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh
@@ -7,11 +7,6 @@ NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
-# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
-export NVTE_FLASH_ATTN=0
-export NVTE_FUSED_ATTN=0
-export NVTE_UNFUSED_ATTN=1
-
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
@@ -36,6 +31,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--untie-embeddings-and-output-weights \
+ --attention-backend unfused \
--disable-bias-linear \
--use-rotary-position-embeddings \
--rotary-percent 1.0 \
diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh
new file mode 100644
index 0000000000..d2a4edee47
--- /dev/null
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+set -e
+
+DEFAULT_NAME="/checkpoints/Mistral-NeMo-12B-Base"
+NAME="${1:-$DEFAULT_NAME}"
+
+DEFAULT_QUANT_CFG="fp8"
+QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
+
+# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
+export NVTE_FLASH_ATTN=0
+export NVTE_FUSED_ATTN=0
+export NVTE_UNFUSED_ATTN=1
+
+# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
+TP="8"
+INFERENCE_TP=${TP}
+DECODER_TYPE="llama"
+CHECKPOINT_LOAD_DIR="${NAME}"
+
+if [ "$QUANT_CFG" = "int4_awq" ]; then
+ INFERENCE_TP="1"
+fi
+
+additional_options=" \
+ --export-quant-cfg ${QUANT_CFG} \
+ --export-legacy-megatron \
+ --export-te-mcore-model \
+ --calib-batch-size 8 \
+ --decoder ${DECODER_TYPE} \
+ --export-dir /tmp/trtllm_ckpt \
+ --inference-tensor-parallel ${INFERENCE_TP} "
+
+# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+options=" \
+ --untie-embeddings-and-output-weights \
+ --no-masked-softmax-fusion \
+ --no-position-embedding \
+ --use-mcore-models \
+ --disable-bias-linear \
+ --rotary-percent 1.0 \
+ --attention-dropout 0.0 \
+ --hidden-dropout 0.0 \
+ --tensor-model-parallel-size ${TP} \
+ --pipeline-model-parallel-size 1 \
+ --num-layers 32 \
+ --hidden-size 4096 \
+ --ffn-hidden-size 14336 \
+ --num-attention-heads 32 \
+ --seq-length 4096 \
+ --kv-channels 128 \
+ --normalization RMSNorm \
+ --swiglu \
+ --num-query-groups 8 \
+ --num-experts 8 \
+ --moe-router-topk 2 \
+ --moe-aux-loss-coeff 1e-2 \
+ --moe-router-load-balancing-type aux_loss \
+ --group-query-attention \
+ --position-embedding-type rope \
+ --no-rope-fusion \
+ --max-position-embeddings 32768 \
+ --micro-batch-size 1 \
+ --tokenizer-type HuggingFaceTokenizer \
+ --tiktoken-pattern v2 \
+ --tokenizer-model mistralai/Mixtral-8x7B-Instruct-v0.1 \
+ --save-interval 1000000 \
+ --load ${CHECKPOINT_LOAD_DIR} \
+ --bf16 \
+ --rotary-base 1000000 \
+ --use-dist-ckpt"
+
+# Precompile CUDA extensions
+python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
+
+# Acquire launch configuration where variable launch_config will be set
+launch_config="--nproc_per_node=${TP}"
+
+# Launch multi-process with torchrun
+torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}
+
+
diff --git a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py
index 340c9c90f7..c915cec790 100644
--- a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py
+++ b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py
@@ -6,7 +6,7 @@
import sys
from pathlib import Path
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../")))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
import modelopt.torch.quantization as mtq
import torch
@@ -120,6 +120,9 @@ def get_calib_dataloader(
print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.")
args.exit_on_missing_checkpoint = True
+    if hasattr(args, 'moe_grouped_gemm') and args.moe_grouped_gemm:
+ print_rank_0("WARNING: Forcing moe_grouped_gemm to False for PTQ and export.")
+ args.moe_grouped_gemm = False
# Set up model and load checkpoint
# [ModelOpt]: make sure that output logits are allgathered.
@@ -168,7 +171,7 @@ def hf_dataset_forword_loop_func(model):
model,
prompts=prompts,
tokens_to_generate=0,
- return_output_log_probs=True,
+ return_output_log_probs=False,
temperature=1.0,
)
else:
@@ -216,3 +219,4 @@ def hf_dataset_forword_loop_func(model):
)
print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}")
+ torch.distributed.barrier()
diff --git a/examples/gpt3/train_gpt3_175b_distributed.sh b/examples/gpt3/train_gpt3_175b_distributed.sh
index b164ae2e91..7d2c01b315 100755
--- a/examples/gpt3/train_gpt3_175b_distributed.sh
+++ b/examples/gpt3/train_gpt3_175b_distributed.sh
@@ -31,6 +31,7 @@ GPT_MODEL_ARGS=(
--num-attention-heads 96
--seq-length 2048
--max-position-embeddings 2048
+ --attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
diff --git a/examples/multimodal/Dockerfile b/examples/multimodal/Dockerfile
index 0ea6edda3f..7b54091ae6 100644
--- a/examples/multimodal/Dockerfile
+++ b/examples/multimodal/Dockerfile
@@ -10,17 +10,17 @@ RUN apt update && \
bash \
git \
vim \
+ tmux \
python-is-python3 \
default-jre
RUN pip install --upgrade pip
-RUN pip install einops einops-exts sentencepiece braceexpand webdataset
-RUN pip install transformers datasets
+RUN pip install einops einops-exts sentencepiece braceexpand webdataset packaging
+RUN pip install transformers datasets accelerate timm
RUN pip install pytest-cov pytest_mock nltk wrapt
RUN pip install zarr "tensorstore==0.1.45"
-RUN pip install git+https://github.com/fanshiqing/grouped_gemm@main
RUN pip install black isort click==8.0.2
-RUN pip install pycocoevalcap megatron-energon
+RUN pip install pycocoevalcap megatron-energon mistral-common tiktoken
RUN pip install git+https://github.com/openai/CLIP.git
# Use --no-deps for the following to avoid outdated and unnecessary dependencies.
-RUN pip install open-flamingo[eval] --no-deps
+RUN pip install open_clip_torch open-flamingo[eval] --no-deps
diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md
index 00be3b46b0..62e47567b9 100644
--- a/examples/multimodal/README.md
+++ b/examples/multimodal/README.md
@@ -16,14 +16,15 @@ You can build a docker container using `examples/multimodal/Dockerfile` to run t
### Language model
-Follow the instructions in `megatron-lm/docs/llama_mistral.md` to download weights for Mistral-7B-Instruct-v0.3 and convert to mcore format with tensor parallel size 4
+Follow the instructions in [Mistral](../../docs/llama_mistral.md#mistral-7b) to download weights for Mistral-7B v0.3 (Base or Instruct) from HuggingFace and convert them to mcore format with tensor parallel size 4.
+Please use the tokenizer from HuggingFace.
### Vision model
This example uses the OpenAI CLIP `ViT-L/14@336px` Vision model. To download the weights from OpenAI and convert them to a format that can be loaded in megatron, please run the following:
```
-python examples/multimodal/clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4 --use-te
+python examples/multimodal/model_converter/clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4 --use-te
```
### Combined model checkpoint
@@ -31,7 +32,7 @@ python examples/multimodal/clip_converter.py --download-root /some/download/fold
Update the paths to point to the mcore converted CLIP and Mistral models and run the following script to combine the Mistral and CLIP models into a single multimodal checkpoint folder:
```
-examples/multimodal/combine_mistral_clip.sh /path/to/mistral/model /path/to/clip/model /output/dir
+examples/multimodal/combine_lm_vision_checkpoints.sh /path/to/mistral/model /path/to/clip/model /output/dir
```
## Training
@@ -57,7 +58,7 @@ examples/multimodal/combine_mistral_clip.sh /path/to/mistral/model /path/to/clip
```
cd /wds
- energon ./
+ energon prepare ./
```
select the following values for the presented options:
@@ -112,7 +113,7 @@ Run the following script:
```
examples/multimodal/text_generation_mistral_clip.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
- --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer.model --gt-path /path/to/groundtruth/file --task generation-task-name
+ --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer/ --gt-path /path/to/groundtruth/file --task generation-task-name
```
where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning` or `MMMU`.
diff --git a/examples/multimodal/combine_lm_vision_checkpoints.sh b/examples/multimodal/combine_lm_vision_checkpoints.sh
new file mode 100755
index 0000000000..52de16ecd2
--- /dev/null
+++ b/examples/multimodal/combine_lm_vision_checkpoints.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+MCORE_LM=$1 #
+MCORE_VISION=$2 #
+OUTPUT_DIR=$3 #
+MODEL_TYPE=$4 # Model type. Default: Mistral CLIP example.
+
+if [[ $MODEL_TYPE == "nvlm" ]]; then
+ # NVLM TP=8
+ python examples/multimodal/combine_state_dicts.py \
+ --input \
+ ${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_01/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_01/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_02/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_02/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_03/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_03/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_04/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_04/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_05/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_05/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_06/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_06/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_07/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_07/model_optim_rng.pt \
+ --prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
+ --output \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_00/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_01/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_02/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_03/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_04/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_05/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_06/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_07/model_optim_rng.pt
+else
+ # Mistral CLIP example TP=4.
+ python examples/multimodal/combine_state_dicts.py \
+ --input \
+ ${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_01/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_01/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_02/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_02/model_optim_rng.pt \
+ ${MCORE_LM}/iter_0000001/mp_rank_03/model_optim_rng.pt \
+ ${MCORE_VISION}/iter_0000001/mp_rank_03/model_optim_rng.pt \
+ --prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
+ --output \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_00/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_01/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_02/model_optim_rng.pt \
+ ${OUTPUT_DIR}/iter_0000001/mp_rank_03/model_optim_rng.pt
+fi
+
+echo 1 > ${OUTPUT_DIR}/latest_checkpointed_iteration.txt
diff --git a/examples/multimodal/combine_mistral_clip.sh b/examples/multimodal/combine_mistral_clip.sh
deleted file mode 100755
index ff866c7f72..0000000000
--- a/examples/multimodal/combine_mistral_clip.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#/bin/bash
-MCORE_MISTRAL=$1 #
-MCORE_CLIP=$2 #
-OUTPUT_DIR=$3 #
-
-python examples/multimodal/combine_state_dicts.py \
- --input \
- ${MCORE_MISTRAL}/iter_0000001/mp_rank_00/model_optim_rng.pt \
- ${MCORE_CLIP}/iter_0000001/mp_rank_00/model_optim_rng.pt \
- ${MCORE_MISTRAL}/iter_0000001/mp_rank_01/model_optim_rng.pt \
- ${MCORE_CLIP}/iter_0000001/mp_rank_01/model_optim_rng.pt \
- ${MCORE_MISTRAL}/iter_0000001/mp_rank_02/model_optim_rng.pt \
- ${MCORE_CLIP}/iter_0000001/mp_rank_02/model_optim_rng.pt \
- ${MCORE_MISTRAL}/iter_0000001/mp_rank_03/model_optim_rng.pt \
- ${MCORE_CLIP}/iter_0000001/mp_rank_03/model_optim_rng.pt \
- --prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
- --output \
- ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_00/model_optim_rng.pt \
- ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_01/model_optim_rng.pt \
- ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_02/model_optim_rng.pt \
- ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/iter_0000001/mp_rank_03/model_optim_rng.pt
-
-echo 1 > ${OUTPUT_DIR}/mistral_instruct_clip336_tp4_combined_mcore/latest_checkpointed_iteration.txt
diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py
index cf48b131a7..ee404604b6 100644
--- a/examples/multimodal/config.py
+++ b/examples/multimodal/config.py
@@ -7,34 +7,33 @@
def get_language_model_config(config):
- if config.language_model_type == "2b":
+ if config.language_model_type == "llama3_8b":
+ config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
config.gated_linear_unit = True
- config.apply_query_key_layer_scaling = True
- config.layernorm_zero_centered_gamma = True
+ config.apply_query_key_layer_scaling = False
+ config.layernorm_zero_centered_gamma = (
+ False # Zero centered gamma not supported for RMSNorm
+ )
config.bias_dropout_fusion = False
- config.rotary_percent = 0.5
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
- elif config.language_model_type == "8b":
+ config.ffn_hidden_size = 14336
+ elif config.language_model_type == "mistral_7b":
+ config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
- config.gated_linear_unit = False
- config.apply_query_key_layer_scaling = True
- config.layernorm_zero_centered_gamma = True
+ config.gated_linear_unit = True
+ config.apply_query_key_layer_scaling = False
+ config.layernorm_zero_centered_gamma = (
+ False # Zero centered gamma not supported for RMSNorm
+ )
config.bias_dropout_fusion = False
- config.rotary_percent = 0.5
- config.attention_dropout = 0.0
config.apply_rope_fusion = False
- config.activation_func = squared_relu
- config.ffn_hidden_size = 16384
- config.masked_softmax_fusion = True
config.attention_softmax_in_fp32 = True
- config.num_query_groups = 32
- config.kv_channels = 128
- config.rotary_interleaved = False
- elif config.language_model_type == "llama3_8b":
+ config.ffn_hidden_size = 14336
+ elif config.language_model_type == "yi-34b":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.bias_activation_fusion = False
@@ -46,10 +45,11 @@ def get_language_model_config(config):
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
- config.ffn_hidden_size = 14336
- elif config.language_model_type == "mistral_7b":
+ config.ffn_hidden_size = 20480
+ elif config.language_model_type == "qwen2.5_7B":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
+ config.add_qkv_bias = True
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
@@ -59,7 +59,23 @@ def get_language_model_config(config):
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
- config.ffn_hidden_size = 14336
+ config.ffn_hidden_size = 18944
+ elif config.language_model_type == "qwen2.0_72B":
+ config.activation_func = torch.nn.functional.silu
+ config.add_bias_linear = False
+ config.add_qkv_bias = True
+ config.bias_activation_fusion = False
+ config.gated_linear_unit = True
+ config.apply_query_key_layer_scaling = False
+ config.layernorm_zero_centered_gamma = (
+ False # Zero centered gamma not supported for RMSNorm
+ )
+ config.bias_dropout_fusion = False
+ config.apply_rope_fusion = False
+ config.attention_softmax_in_fp32 = True
+ config.ffn_hidden_size = 29568
+ else:
+ raise ValueError(f"unknown language model type {config.language_model_type}")
return config
@@ -107,6 +123,29 @@ def get_vision_model_config(config, apply_query_key_layer_scaling):
config.apply_rope_fusion = False
config.qk_layernorm = False
config.layernorm_epsilon = 1e-6
+ elif config.vision_model_type == "internvit":
+ config.num_layers = 45
+ config.num_attention_heads = 32 # Padded for TP=8.
+ config.num_query_groups = 32 # Padded for TP=8.
+ config.kv_channels = 128
+ config.add_bias_linear = True
+ config.add_qkv_bias = False
+ config.hidden_size = 3200
+ config.hidden_dropout = 0.0
+ config.attention_dropout = 0.0
+ config.ffn_hidden_size = 12800
+ config.gated_linear_unit = False
+ config.activation_func = torch.nn.functional.gelu
+ config.layernorm_zero_centered_gamma = False
+ config.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+ config.bias_activation_fusion = False
+ config.bias_dropout_fusion = False
+ config.attention_softmax_in_fp32 = True
+ config.normalization = 'RMSNorm'
+ config.layernorm_epsilon = 1e-6
+ config.apply_rope_fusion = False
+ else:
+ raise ValueError(f"unknown vision model type {config.vision_model_type}")
return config
@@ -116,18 +155,26 @@ def get_vision_projection_config(config, hidden_size):
config.bias_activation_fusion = False
config.add_bias_linear = False
config.hidden_size = hidden_size # Used as the vision projection output size, i.e., the input to the language model.
- if config.language_model_type == "2b":
- config.ffn_hidden_size = 5440
- config.activation_func = torch.nn.functional.gelu
- if config.language_model_type == "8b":
- config.ffn_hidden_size = 16384
- config.activation_func = squared_relu
- elif config.language_model_type == "llama3_8b":
+ if config.language_model_type == "llama3_8b":
config.ffn_hidden_size = 14336
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "mistral_7b":
config.ffn_hidden_size = 14336
config.activation_func = torch.nn.functional.gelu
+ config.normalization = None
+ elif config.language_model_type == "yi-34b":
+ config.ffn_hidden_size = 20480
+ config.normalization = "LayerNorm"
+ config.activation_func = torch.nn.functional.gelu
+ elif config.language_model_type == "qwen2.5_7B":
+ config.ffn_hidden_size = 3584
+ config.activation_func = torch.nn.functional.gelu
+ elif config.language_model_type == "qwen2.0_72B":
+ config.ffn_hidden_size = 29568
+ config.normalization = "LayerNorm"
+ config.activation_func = torch.nn.functional.gelu
+ else:
+ raise ValueError(f"unknown language model type {config.language_model_type}")
return config
@@ -151,5 +198,3 @@ class EvaluationConfig:
num_partitions: int = 1
partition_id: int = 0
num_samples_per_partition: int = 0
-
- prompt_format: str = "mistral"
diff --git a/examples/multimodal/conversation.py b/examples/multimodal/conversation.py
deleted file mode 100644
index 5139d20335..0000000000
--- a/examples/multimodal/conversation.py
+++ /dev/null
@@ -1,353 +0,0 @@
-# From https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/conversation.py
-
-import dataclasses
-from enum import auto, Enum
-from typing import List
-
-
-class SeparatorStyle(Enum):
- """Different separator style."""
- SINGLE = auto()
- TWO = auto()
- MPT = auto()
- PLAIN = auto()
- LLAMA_2 = auto()
-
-
-@dataclasses.dataclass
-class Conversation:
- """A class that keeps all conversation history."""
- system: str
- roles: List[str]
- messages: List[List[str]]
- offset: int
- sep_style: SeparatorStyle = SeparatorStyle.SINGLE
- sep: str = "###"
- sep2: str = None
- real_sep2: str = None
- version: str = "Unknown"
-
- skip_next: bool = False
-
- def get_prompt(self):
- messages = self.messages
- if len(messages) > 0 and type(messages[0][1]) is tuple:
- messages = self.messages.copy()
- init_role, init_msg = messages[0].copy()
-            init_msg = init_msg[0].replace("<image>", "").strip()
-            if 'mmtag' in self.version:
-                messages[0] = (init_role, init_msg)
-                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
-                messages.insert(1, (self.roles[1], "Received."))
-            else:
-                messages[0] = (init_role, "<image>\n" + init_msg)
-
- if self.sep_style == SeparatorStyle.SINGLE:
- ret = self.system + self.sep
- for role, message in messages:
- if message:
- if type(message) is tuple:
- message, _, _ = message
- ret += role + ": " + message + self.sep
- else:
- ret += role + ":"
- elif self.sep_style == SeparatorStyle.TWO:
- seps = [self.sep, self.sep2]
- ret = self.system + seps[0]
- for i, (role, message) in enumerate(messages):
- if message:
- if type(message) is tuple:
- message, _, _ = message
- ret += role + ": " + message + seps[i % 2]
- else:
- ret += role + ":"
- elif self.sep_style == SeparatorStyle.MPT:
- ret = self.system + self.sep
- for role, message in messages:
- if message:
- if type(message) is tuple:
- message, _, _ = message
- ret += role + message + self.sep
- else:
- ret += role
- elif self.sep_style == SeparatorStyle.LLAMA_2:
-            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
- wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
- ret = ""
-
- for i, (role, message) in enumerate(messages):
- if i == 0:
- assert message, "first message should not be none"
- assert role == self.roles[0], "first message should come from user"
- if message:
- if type(message) is tuple:
- message, _, _ = message
- if i == 0: message = wrap_sys(self.system) + message
- if i % 2 == 0:
- message = wrap_inst(message)
- ret += self.sep + message
- else:
- ret += " " + message + " " + self.sep2
- else:
- ret += ""
- ret = ret.lstrip(self.sep)
- elif self.sep_style == SeparatorStyle.PLAIN:
- seps = [self.sep, self.sep2]
- ret = self.system
- for i, (role, message) in enumerate(messages):
- if message:
- if type(message) is tuple:
- message, _, _ = message
- ret += message + seps[i % 2]
- else:
- ret += ""
- else:
- raise ValueError(f"Invalid style: {self.sep_style}")
-
- return ret
-
- def append_message(self, role, message):
- self.messages.append([role, message])
-
- def get_images(self, return_pil=False):
- images = []
- for i, (role, msg) in enumerate(self.messages[self.offset:]):
- if i % 2 == 0:
- if type(msg) is tuple:
- import base64
- from io import BytesIO
- from PIL import Image
- msg, image, image_process_mode = msg
- if image_process_mode == "Pad":
- def expand2square(pil_img, background_color=(122, 116, 104)):
- width, height = pil_img.size
- if width == height:
- return pil_img
- elif width > height:
- result = Image.new(pil_img.mode, (width, width), background_color)
- result.paste(pil_img, (0, (width - height) // 2))
- return result
- else:
- result = Image.new(pil_img.mode, (height, height), background_color)
- result.paste(pil_img, ((height - width) // 2, 0))
- return result
- image = expand2square(image)
- elif image_process_mode in ["Default", "Crop"]:
- pass
- elif image_process_mode == "Resize":
- image = image.resize((336, 336))
- else:
- raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
- max_hw, min_hw = max(image.size), min(image.size)
- aspect_ratio = max_hw / min_hw
- max_len, min_len = 800, 400
- shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
- longest_edge = int(shortest_edge * aspect_ratio)
- W, H = image.size
- if longest_edge != max(image.size):
- if H > W:
- H, W = longest_edge, shortest_edge
- else:
- H, W = shortest_edge, longest_edge
- image = image.resize((W, H))
- if return_pil:
- images.append(image)
- else:
- buffered = BytesIO()
- image.save(buffered, format="PNG")
- img_b64_str = base64.b64encode(buffered.getvalue()).decode()
- images.append(img_b64_str)
- return images
-
- def to_gradio_chatbot(self):
- ret = []
- for i, (role, msg) in enumerate(self.messages[self.offset:]):
- if i % 2 == 0:
- if type(msg) is tuple:
- import base64
- from io import BytesIO
- msg, image, image_process_mode = msg
- max_hw, min_hw = max(image.size), min(image.size)
- aspect_ratio = max_hw / min_hw
- max_len, min_len = 800, 400
- shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
- longest_edge = int(shortest_edge * aspect_ratio)
- W, H = image.size
- if H > W:
- H, W = longest_edge, shortest_edge
- else:
- H, W = shortest_edge, longest_edge
- image = image.resize((W, H))
- buffered = BytesIO()
- image.save(buffered, format="JPEG")
- img_b64_str = base64.b64encode(buffered.getvalue()).decode()
-                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
-                    msg = img_str + msg.replace('<image>', '').strip()
- ret.append([msg, None])
- else:
- ret.append([msg, None])
- else:
- ret[-1][-1] = msg
- return ret
-
- def copy(self):
- return Conversation(
- system=self.system,
- roles=self.roles,
- messages=[[x, y] for x, y in self.messages],
- offset=self.offset,
- sep_style=self.sep_style,
- sep=self.sep,
- sep2=self.sep2,
- real_sep2=self.real_sep2,
- version=self.version)
-
- def dict(self):
- if len(self.get_images()) > 0:
- return {
- "system": self.system,
- "roles": self.roles,
- "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
- "offset": self.offset,
- "sep": self.sep,
- "sep2": self.sep2,
- "real_sep2": self.real_sep2
- }
- return {
- "system": self.system,
- "roles": self.roles,
- "messages": self.messages,
- "offset": self.offset,
- "sep": self.sep,
- "sep2": self.sep2,
- "real_sep2": self.real_sep2
- }
-
-
-conv_mpt = Conversation(
- system="""<|im_start|>system
-A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
- roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
- version="mpt",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.MPT,
- sep="<|im_end|>",
-)
-
-
-### Used for llava-pretraining
-conv_llava_plain = Conversation(
- system="",
- roles=("", ""),
- messages=(
- ),
- offset=0,
- sep_style=SeparatorStyle.PLAIN,
- sep="\n",
-)
-
-conv_llava_v0 = Conversation(
- system="A chat between a curious human and an artificial intelligence assistant. "
- "The assistant gives helpful, detailed, and polite answers to the human's questions.",
- roles=("Human", "Assistant"),
- messages=(
- ),
- offset=0,
- sep_style=SeparatorStyle.SINGLE,
- sep="###",
-)
-
-conv_llava_v0_mmtag = Conversation(
- system="A chat between a curious user and an artificial intelligence assistant. "
- "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
- "The visual content will be provided with the following format: visual content.",
- roles=("Human", "Assistant"),
- messages=(
- ),
- offset=0,
- sep_style=SeparatorStyle.SINGLE,
- sep="###",
- version="v0_mmtag",
-)
-
-conv_llava_v1 = Conversation(
- system="A chat between a curious human and an artificial intelligence assistant. "
- "The assistant gives helpful, detailed, and polite answers to the human's questions.",
- roles=("USER", "ASSISTANT"),
- version="v1",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.TWO,
- sep=" ",
- sep2="",
-)
-
-conv_llava_v1_mmtag = Conversation(
- system="A chat between a curious user and an artificial intelligence assistant. "
- "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
- "The visual content will be provided with the following format: visual content.",
- roles=("USER", "ASSISTANT"),
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.TWO,
- sep=" ",
- sep2="",
- version="v1_mmtag",
-)
-
-chatqa_sft = Conversation(
- system="System: This is a chat between a user and an artificial intelligence assistant. "
- "The assistant gives helpful, detailed, and polite answers to the user's questions.",
- roles=("User", "Assistant"),
- version="chatqa",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.TWO,
- sep="\n\n",
- sep2="\n\n",
- real_sep2="\n\n"
-)
-
-conv_chatml = Conversation(
- system="""<|im_start|>system
-Answer the questions.""",
- roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
- version="mpt",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.MPT,
- sep="<|im_end|>",
-)
-
-mistral_instruct = Conversation(
- system="",
- roles=("user", "assistant"),
- version="mpt",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.LLAMA_2,
- sep="",
- sep2="",
-)
-
-llama3_instruct = Conversation(
- system="<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.",
- roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
- version="mpt",
- messages=(),
- offset=0,
- sep_style=SeparatorStyle.MPT,
- sep="<|eot_id|>",
-)
-
-conv_templates = {
- "plain": conv_llava_plain,
- "v0_plain": conv_llava_plain,
- "llava_v0": conv_llava_v0,
- "v0_mmtag": conv_llava_v0_mmtag,
- "llava_v1": conv_llava_v1,
- "v1_mmtag": conv_llava_v1_mmtag,
-
- "mpt": conv_mpt,
-}
diff --git a/examples/multimodal/dataloader_provider.py b/examples/multimodal/dataloader_provider.py
index 4bd1b29e51..aef2186834 100644
--- a/examples/multimodal/dataloader_provider.py
+++ b/examples/multimodal/dataloader_provider.py
@@ -15,23 +15,24 @@
get_val_datasets,
)
from megatron.core.num_microbatches_calculator import get_num_microbatches
-from megatron.core.parallel_state import get_tensor_model_parallel_rank
-from megatron.training import get_args, print_rank_0
+from megatron.core.parallel_state import get_tensor_model_parallel_rank, get_pipeline_model_parallel_world_size, get_pipeline_model_parallel_rank
+from megatron.training import get_args
from megatron.training.checkpointing import get_checkpoint_name
def datasets_provider(worker_config=None):
"""Create multimodal train, validation and test datasets."""
args = get_args()
+
dname = args.data_path[0] if type(args.data_path) is list else args.data_path
train_dataset = get_train_dataset(
dname,
batch_size=args.micro_batch_size,
task_encoder=TaskEncoder(),
worker_config=worker_config,
- virtual_epoch_length=1000,
- max_samples_per_sequence=100,
- shuffle_buffer_size=100,
+ max_samples_per_sequence=None,
+ shuffle_buffer_size=None,
+ packing_buffer_size=args.packing_buffer_size,
handler=print_error_handler,
image_decode="pil",
)
@@ -43,6 +44,7 @@ def datasets_provider(worker_config=None):
# limit=args.eval_iters * get_num_microbatches(),
task_encoder=TaskEncoder(),
worker_config=worker_config,
+ packing_buffer_size=args.packing_buffer_size,
handler=print_error_handler,
image_decode="pil",
)
@@ -61,13 +63,44 @@ def datasets_provider(worker_config=None):
return train_dataset, val_datasets_without_source_datasets, None
+def is_first_or_last_stage(pp_size, encoder_pipeline_model_parallel_size):
+ """Check if the current pipeline parallel stage is the first or last stage."""
+ if pp_size == 1: # No pipeline parallelism.
+ return True
+
+ is_valid_rank = False
+ pp_rank = get_pipeline_model_parallel_rank()
+ if encoder_pipeline_model_parallel_size == 0:
+ # No separate pipeline stage for the vision model. Run the dataloader on the first and last pipeline stage.
+ is_valid_rank = pp_rank in (0, pp_size-1)
+ elif encoder_pipeline_model_parallel_size == 1:
+ # Separate pipeline stage for the vision model. Run the dataloader on the first vision and LM stage and last LM stage.
+ is_valid_rank = pp_rank in (0, 1, pp_size-1)
+ else:
+ raise NotImplementedError("encoder-pipeline-model-parallel-size > 1 is not supported yet")
+
+ return is_valid_rank
+
+
+def is_dataloader_rank(encoder_pipeline_model_parallel_size):
+ """Check if we should have the dataloader on this tensor and pipeline parallel rank."""
+ # Run dataloader only on the first tensor parallel rank (will be broadcasted to others).
+ is_first_rank = get_tensor_model_parallel_rank() == 0
+
+ pp_size = get_pipeline_model_parallel_world_size()
+ is_first_rank = is_first_rank and is_first_or_last_stage(pp_size, encoder_pipeline_model_parallel_size)
+
+ return is_first_rank
+
+
def train_valid_test_dataloaders_provider(train_val_test_num_samples):
"""Build multimodal train, validation and test dataloaders."""
- if get_tensor_model_parallel_rank() != 0:
- return None, None, None
-
args = get_args()
+ # Dataloader is only on specific ranks.
+ if not is_dataloader_rank(args.encoder_pipeline_model_parallel_size):
+ return None, None, None
+
worker_debug_path = None
worker_log_level = 0
@@ -92,15 +125,18 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples):
data_save_name = get_checkpoint_name(
args.dataloader_save,
args.iteration,
+ pipeline_rank=0, # Only the first pipeline parallel rank stores the dataloader checkpoint.
basename=f"train_dataloader_dprank{dp_rank:03d}.pt",
)
if os.path.exists(data_save_name):
try:
dataset_state_dict = torch.load(data_save_name, map_location="cpu")
train_dataloader.restore_state_rank(dataset_state_dict["dataloader_state_dict"])
- print_rank_0(f"restored dataset state from {data_save_name}")
+ print(f"restored dataset state from {data_save_name}")
except Exception as e:
- print_rank_0("loading dataloader checkpoint failed. Skipping. " + str(e))
+ print("loading dataset state failed. Skipping. " + str(e))
+ else:
+ print(f"dataset state {data_save_name} does not exist")
valid_dataloader = [
EnergonDataloader(get_loader(valid_ds, worker_config=worker_config))
diff --git a/examples/multimodal/dataset_helpers.py b/examples/multimodal/dataset_helpers.py
index 6468eef9bb..de76f8e45e 100644
--- a/examples/multimodal/dataset_helpers.py
+++ b/examples/multimodal/dataset_helpers.py
@@ -1,295 +1,239 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import bisect
import dataclasses
-import itertools
import json
-import random
-import re
import sys
import traceback
from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
from image_processing import get_visual_transform
-import conversation as conversation_lib
import numpy as np
import torch
-from PIL import Image, ImageDraw
-from torchvision import transforms as T
-from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN_INDEX
+from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN
+from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.energon import (
Batch,
CaptioningSample,
DefaultTaskEncoder,
OCRSample,
+ Sample,
SimilarityInterleavedSample,
VQASample,
+ MultiChoiceVQASample
)
-from megatron.energon.transforms import CustomTransform, MergeTransform
-from megatron.training import get_args
-from megatron.training.tokenizer import build_tokenizer
+from megatron.energon.task_encoder.base import stateless
+from megatron.training import get_args, get_tokenizer
-class RandomResize(CustomTransform):
- """Resizes the image by a random scale factor in the given interval, but at most max_size"""
+@dataclass
+class ImageTaskSample(Sample):
+ __key__: str
+ __restore_key__: Tuple[Union[str, int, tuple], ...]
+ __subflavor__: Dict
+ __subflavors__: Dict
+ # (c, h, w)
+ imgs: List[torch.Tensor]
+ num_tiles: List[int]
+ tokens: torch.Tensor
+ total_len: int # Total token count in the sample, including text and image tokens
+ labels: torch.Tensor = None
- def __init__(self, min_scale: float, max_scale: float, max_size: int):
- self._min_scale = min_scale
- self._max_scale = max_scale
- self._max_size = max_size
- def apply_transform(self, matrix: np.ndarray, dst_size: np.ndarray) -> Tuple[Any, Any, Any]:
- scale = random.uniform(self._min_scale, self._max_scale)
- new_size = tuple(int(x * scale) for x in dst_size)
+@dataclass
+class ImageTaskSamplePacked(Sample):
+ """Dataclass to store a single packed sample (not a batch).
+
+ P = Number of sub-samples in the packed sample
+ seq_len = Total sequence length
+ num_imgs = Number of images across all samples in the packed sample
+ """
+
+ __key__: str # Sample name
+ __restore_key__: Tuple[Union[str, int, tuple], ...]
+ __subflavor__: Dict # Sample metadata. Deprecated.
+ __subflavors__: Dict # Sample metadata.
+ tokens: torch.Tensor # Input tokens packed into a single tensor (seq_len,)
+ labels: torch.Tensor # Target tokens packed into a single tensor (seq_len,)
+ imgs: List[torch.Tensor] # Input images
+ num_tiles: List[int] # Number of tiles for each image of each sample (num_imgs)
+ max_length: int # Maximum length across sub-samples.
+ cu_lengths: List[int] # Cumulative length of each sub-sample in this packed sample incl. text and image tokens (P,)
- if max(new_size) > self._max_size:
- scale = self._max_size / max(new_size)
- new_size = tuple(int(x * scale) for x in dst_size)
- matrix = self.scale(scale, scale) @ matrix
- dst_size = np.array(new_size, dtype=dst_size.dtype)
+# Typing for the resulting batch data after encode_batch()
+@dataclass
+class ImageTaskBatchPacked(Batch):
+ """Dataclass to store a batch of packed samples.
- return matrix, dst_size, (self.__class__.__name__, scale)
+ N = Batch size
+ P = Number of samples in the packed sample
+ seq_len = Maximum sequence length
+ num_imgs = Number of images across all samples in the packed sample
+ """
+ __key__: List[str] # Sample names
+ __restore_key__: Tuple[Union[str, int, tuple], ...]
+ __subflavor__: Dict # Sample metadata. Deprecated.
+ __subflavors__: List[Dict] # Sample metadatas.
+ tokens: torch.Tensor # Input tokens packed and padded (N, seq_len)
+ labels: torch.Tensor # Target tokens packed and padded (N, seq_len)
+ imgs: torch.Tensor # All image tiles stacked into a single tensor (num_tiles, C, H, W)
+ num_tiles: List[List[int]] # Number of tiles per image (N, num_imgs)
+ max_lengths: List[int] # Maximum length across sub-samples (N,)
+ cu_lengths: List[List[int]] # Cumulative length of each sub-sample in each packed sample of the batch (N, P)
-class RandomResizeLongEdge(CustomTransform):
- """Resizes the image's longer edge to a random length between min_size and max_size pixels."""
- def __init__(self, min_size: int, max_size: int):
- self._min_size = min_size
- self._max_size = max_size
+# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L19
+# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0.
+def search_for_fit(numbers: List[int], capacity: int) -> int:
+ """Finds the index of largest number that fits into the knapsack with the given capacity."""
+ index = bisect.bisect(numbers, capacity)
+ return -1 if index == 0 else (index - 1)
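+ # Example (assuming `numbers` is sorted ascending, as at the call site below):
+ # search_for_fit([2, 3, 5, 8], capacity=6) returns index 2 (5 is the largest value that fits),
+ # while search_for_fit([2, 3, 5, 8], capacity=1) returns -1 (nothing fits).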
- def apply_transform(self, matrix: np.ndarray, dst_size: np.ndarray) -> Tuple[Any, Any, Any]:
- new_long = random.randint(self._min_size, self._max_size)
- if dst_size[0] > dst_size[1]: # h > w
- new_w, new_h = int(new_long * dst_size[1] / dst_size[0]), new_long
- else: # w > h
- new_w, new_h = new_long, int(new_long * dst_size[0] / dst_size[1])
- new_size = (new_h, new_w)
- matrix = self.scale(new_w / dst_size[1], new_h / dst_size[0]) @ matrix
- dst_size = np.array(new_size, dtype=dst_size.dtype)
+# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L27
+# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0.
+def greedy_knapsack(item_sizes: List[int], samples: List, max_capacity: int) -> List:
+ """Greedy algorithm with binary search for the knapsack problem.
- return matrix, dst_size, (self.__class__.__name__, new_size)
+ Pack as many samples as possible given a maximum capacity and capacities of individual samples.
+ Used if sequence packing is enabled.
+ """
+ assert len(item_sizes) == len(samples), "sample lengths and samples must have the same length."
+ knapsacks = []
-class RandomPad(CustomTransform):
- """Pads the image to the given size, randomly choosing the position of the image within the new larger image.
- If the image is already larger than the given size, it will not be padded in that direction(s)."""
+ if len(item_sizes) == 0:
+ return knapsacks
- def __init__(self, size: Tuple[int, int]):
- self._new_size = size # h, w
+ # Sort sample lengths and samples together.
+ sorted_item_sizes, sorted_samples = zip(*sorted(zip(item_sizes, samples), key=lambda x: x[0]))
+ sorted_item_sizes = list(sorted_item_sizes)
+ sorted_samples = list(sorted_samples)
- def apply_transform(self, matrix: np.ndarray, dst_size: np.ndarray) -> Tuple[Any, Any, Any]:
- h_pad = max(self._new_size[0] - dst_size[0], 0)
- w_pad = max(self._new_size[1] - dst_size[1], 0)
+ # Check if all samples fit in the knapsack capacity.
+ if sorted_item_sizes[-1] > max_capacity:
+ raise ValueError(f"knapsack: A sample is larger {sorted_item_sizes[-1]} than the max_sequence_length {max_capacity}.")
- if h_pad == 0 and w_pad == 0:
- return matrix, dst_size, (self.__class__.__name__, None)
- else:
- # TODO: fix me
- # top = random.randint(0, h_pad)
- # left = random.randint(0, w_pad)
- top = 0
- left = 0
-
- matrix = self.translate(left, top) @ matrix
- dst_size = np.array(self._new_size, dtype=dst_size.dtype)
- return matrix, dst_size, (self.__class__.__name__, (top, left))
-
-
-def _get_ocr_document_visual_transform(IMG_H=1024, IMG_W=1024):
- document_visual_transform = T.Compose(
- [
- MergeTransform(
- [
- # T.RandomResizedCrop(size=FINAL_SIZE, scale=(0.5, 1.0), ratio=(0.8, 1.2)),
- RandomResizeLongEdge(960, 1008), # Note: 1008 comes from list(range(960, 1024, 16))[-1]
- T.RandomRotation(5, interpolation=T.InterpolationMode.BILINEAR),
- T.RandomPerspective(distortion_scale=0.1, p=0.1),
- RandomPad((IMG_H, IMG_W)),
- ]
- ),
- T.ColorJitter(brightness=(0.8, 1.2), contrast=(0.7, 1.0)),
- T.RandomGrayscale(p=0.5),
- T.RandomInvert(p=0.5),
- T.RandomAdjustSharpness(sharpness_factor=0.0, p=0.5),
- T.RandomAdjustSharpness(sharpness_factor=2.0, p=0.5),
- # LogImage(),
- # T.ToTensor(),
- # T.Normalize(IMAGE_MEAN, IMAGE_STD),
- ]
- )
- return document_visual_transform
-
-def _get_ocr_document_identity_transform(IMG_H=1024, IMG_W=1024):
- long_edge = max(IMG_H, IMG_W)
- document_identity_transform = T.Compose(
- [
- MergeTransform(
- [
- RandomResizeLongEdge(long_edge, long_edge),
- RandomPad((long_edge, long_edge)),
- ]
- )
- ]
- )
- return document_identity_transform
-
-def _get_ocr_paragraph_visual_transform(IMG_H=1024, IMG_W=1024):
- paragraph_visual_transform = T.Compose(
- [
- MergeTransform(
- [
- # T.RandomResizedCrop(size=FINAL_SIZE, scale=(0.5, 1.0), ratio=(0.8, 1.2)),
- RandomResize(0.5, 2.0, min(IMG_H, IMG_W)), #FINAL_SIZE),
- T.RandomRotation(1, interpolation=T.InterpolationMode.BILINEAR),
- T.RandomPerspective(distortion_scale=0.1, p=0.1),
- RandomPad((IMG_H, IMG_W)),
- ]
- ),
- T.ColorJitter(brightness=(0.8, 1.2), contrast=(0.7, 1.0)),
- T.RandomGrayscale(p=0.5),
- T.RandomInvert(p=0.5),
- # T.RandomAdjustSharpness(sharpness_factor=0.0, p=0.5),
- # T.RandomAdjustSharpness(sharpness_factor=2.0, p=0.5),
- # LogImage(),
- # T.ToTensor(),
- # T.Normalize(IMAGE_MEAN, IMAGE_STD),
- ]
- )
- return paragraph_visual_transform
+ while sorted_item_sizes:
+ current_knapsack = []
+ remaining_capacity = max_capacity
-# Type for intermediate batch, after batch()
-@dataclass
-class ImageTaskSample:
- __key__: str
- __subflavors__: Dict
- # (c, h, w)
- imgs: List[torch.Tensor]
- num_tiles: List[int]
- text: np.ndarray
- prompt_len: np.int64
- target: torch.Tensor = None
+ while True:
+ idx = search_for_fit(sorted_item_sizes, remaining_capacity)
+ if idx == -1:
+ break # Can't fit more samples.
+ remaining_capacity -= sorted_item_sizes[idx]
-# Typing for the resulting batch data after encode_batch()
-@dataclass
-class ImageTaskBatch(Batch):
- __keys__: List[str]
- __subflavors__: List[Dict]
- # (num_tiles, c, h, w)
- imgs: torch.Tensor
- num_tiles: List[int]
- # (n, seq_len)
- text: torch.Tensor
- # (n, 1)
- prompt_len: torch.Tensor
- # (n, seq_len)
- target: torch.Tensor
-
-class IdentitySplitter(object):
- def tokenize(self, *text):
- return text
-
-class Tokenizer:
- def __init__(self):
-
- args = get_args()
- self.args = args
-
- self.initializer()
-
- def initializer(self):
- # Use Encoder class as a container for global data
- Tokenizer.tokenizer = build_tokenizer(self.args)
- if hasattr(Tokenizer.tokenizer, 'eod'):
- self.eod_token = Tokenizer.tokenizer.eod
- elif hasattr(Tokenizer.tokenizer, 'eos_id'):
- self.eod_token = Tokenizer.tokenizer.eos_id
- else:
- raise AttributeError('No eod token found in Tokenizer')
- self.split_token = 313131
-
- if (
- hasattr(self.args, "split_sentences") and self.args.split_sentences
- ): # default false
- if not nltk_available:
- print("NLTK is not available to split sentences.")
- exit()
- library = "tokenizers/punkt/{}.pickle".format("english")
- # print("loading: " + library)
- splitter = nltk.load(library)
- if self.args.keep_newlines:
- # this prevents punkt from eating newlines after sentences
- Tokenizer.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
- train_text=splitter._params, lang_vars=CustomLanguageVars()
- )
- else:
- Tokenizer.splitter = splitter
- else:
- Tokenizer.splitter = IdentitySplitter()
+ sorted_item_sizes.pop(idx)
+ sample = sorted_samples.pop(idx)
+ current_knapsack.append(sample)
+
+ knapsacks.append(current_knapsack)
- def __call__(self, text: str, padded: bool = True): # -> torch.Tensor:
- sentence = Tokenizer.splitter.tokenize(text)[0]
- sentence = Tokenizer.tokenizer.tokenize(sentence)
- return sentence
+ return knapsacks
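+ # Worked example: greedy_knapsack([3, 1, 2, 2], ["a", "b", "c", "d"], max_capacity=4)
+ # packs the size-3 and size-1 samples together and the two size-2 samples together,
+ # i.e. it returns two knapsacks.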
-class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatch, dict]):
- """A simple task encoder for captioning."""
+class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, dict]):
+ """A simple task encoder for VLMs."""
def __init__(
self
):
- # Specify the batch_type for default batching (batching is performed here "manually" by
- # overwriting the `batch` method)
super().__init__()
self.args = get_args()
- self.tokenizer = Tokenizer()
- self.manual_prompts = json.load(open(self.args.prompt_path))
- self.seq_len = self.args.dataloader_seq_length
+ self.tokenizer = get_tokenizer()
+ with open(self.args.prompt_path, "r") as f:
+ self.manual_prompts = json.load(f)
+ self.dataloader_seq_length = self.args.dataloader_seq_length # Always return samples of this length.
+ self.packing_seq_length = self.args.packing_seq_length # Packing sequence length, if packing is enabled.
+ self.is_packing_enabled = self.args.packing_buffer_size is not None and self.args.packing_buffer_size > 0
+
+ if self.dataloader_seq_length and self.packing_seq_length:
+ assert self.dataloader_seq_length >= self.packing_seq_length, "dataloader sequence length must be greater than or equal to the packing sequence length"
+
+ if self.is_packing_enabled:
+ assert self.packing_seq_length > 0, "packing sequence length must be set"
+
+ self.num_image_embeddings_per_tile = get_num_image_embeddings(
+ self.args.img_h,
+ self.args.img_w,
+ self.args.patch_dim,
+ self.args.vision_model_type,
+ self.args.disable_vision_class_token,
+ 1,
+ self.args.pixel_shuffle,
+ self.args.use_tile_tags,
+ )
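+ # Hypothetical example: a 336x336 image with patch size 14 gives roughly
+ # (336 / 14) ** 2 = 576 embeddings per tile, before any class-token, pixel-shuffle
+ # or tile-tag adjustments applied by get_num_image_embeddings.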
self.txt_to_token_dict = {}
self.img_h, self.img_w = self.args.img_h, self.args.img_w
- self.ocr_document_visual_transform = _get_ocr_document_visual_transform(self.img_h, self.img_w)
- self.ocr_document_identity_transform = _get_ocr_document_identity_transform(self.img_h, self.img_w)
- self.ocr_paragraph_visual_transform = _get_ocr_paragraph_visual_transform(self.img_h, self.img_w)
+ def _get_total_seq_length(self, input_ids, num_tiles):
+ """Calculate expected sequence length given text tokens length and number of tiles."""
+ total_num_images = len(num_tiles)
+ total_num_tiles = sum(num_tiles)
+ total_len = len(input_ids) + total_num_tiles * self.num_image_embeddings_per_tile - total_num_images
+ return total_len
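+ # Example: 100 text tokens (including one image placeholder token) with num_tiles=[2]
+ # and a hypothetical 576 embeddings per tile give 100 + 2 * 576 - 1 = 1251.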
+
+ def _truncate_for_packing(self, input_ids, target, num_tiles):
+ """Truncate tokens and labels if they exceed packing sequence length."""
+ total_num_images = len(num_tiles)
+ total_num_tiles = sum(num_tiles)
+ total_img_embeddings_len = total_num_tiles * self.num_image_embeddings_per_tile
+ max_text_tokens = self.packing_seq_length - total_img_embeddings_len + total_num_images
+
+ input_ids = input_ids[:max_text_tokens]
+ target = target[:max_text_tokens]
+
+ # If truncate causes all labels to be ignored, then skip the sample
+ if (target == IGNORE_INDEX).all():
+ raise ValueError(f"all targets will be ignored after truncation: {input_ids}")
+
+ return input_ids, target
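+ # Example: with packing_seq_length=2048, one image of 2 tiles and a hypothetical
+ # 576 embeddings per tile, the text is capped at 2048 - 2 * 576 + 1 = 897 tokens.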
+ @stateless(restore_seeds=True)
def encode_sample(self, sample: Union[CaptioningSample, OCRSample, VQASample, SimilarityInterleavedSample]):
if isinstance(sample, OCRSample):
- yield self.encode_ocr(sample)
+ if "pdfa" in sample.__key__:
+ yield self.combined_ocr_encoder(sample, task_type='encode_pdf')
+ elif "multi" in sample.__key__:
+ yield self.combined_ocr_encoder(sample, task_type='_encode_ocr')
+ else:
+ yield self.combined_ocr_encoder(sample, task_type='encode_ocr_ref')
elif isinstance(sample, CaptioningSample):
yield self.encode_captioning(sample)
elif isinstance(sample, VQASample):
- is_llava_training = sample.__subflavors__['is_llava_training'] if 'is_llava_training' in sample.__subflavors__ else False
+ is_llava_training = sample.__subflavors__["is_llava_training"] if "is_llava_training" in sample.__subflavors__ else False
if "llava" in sample.__key__ or is_llava_training:
yield self.encode_llava_pretrain(sample)
else:
- yield self.encode_vqa(sample)
+ yield self.encode_any_single_turn_vqa(sample)
elif isinstance(sample, SimilarityInterleavedSample):
- if "llava" or "video" in sample.__key__:
- yield self.encode_llava_sft(sample)
- else:
- raise NotImplementedError('Sample format not supported')
+ yield self.encode_llava_sft(sample)
+ elif isinstance(sample, MultiChoiceVQASample):
+ yield self.encode_any_single_turn_vqa(sample)
else:
- raise NotImplementedError('Sample format not supported')
+ raise NotImplementedError("Sample format not supported", sample)
def encode_captioning(self, sample: CaptioningSample):
+ """Encode CaptioningSample."""
augment = sample.__subflavors__.get("augmentation")
- conv_format = sample.__subflavors__['conv_format'] if 'conv_format' in sample.__subflavors__ else 'mistral'
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
+ self.args.vision_model_type,
)
num_tiles = [len(imgs)]
- prompt_list = self.manual_prompts["CaptioningPretraining"]["llava"]
+ prompt_list = self.manual_prompts["CaptioningPretraining"]["raw"]
prompt_idx = np.random.randint(len(prompt_list))
cur_prompt = prompt_list[prompt_idx]
@@ -302,89 +246,71 @@ def encode_captioning(self, sample: CaptioningSample):
caption_list = caption.split('\n')
caption = np.random.choice(caption_list)
- if conv_format == 'llama3_sft':
- conv = conversation_lib.llama3_instruct.copy()
- sep = conv.sep
- elif conv_format == "mistral":
- conv = conversation_lib.mistral_instruct.copy()
- conv = conv.sep2
-
- conversation = cur_prompt + caption + sep
+ conv = [
+ # Note: no system message.
+ {"role": "user", "content": cur_prompt},
+ {"role": "assistant", "content": caption},
+ ]
- input_ids = np.array(tokenizer_image_token(self.args, conversation, self.tokenizer, has_image=True))
- target = input_ids.copy()
+ input_ids, target = self.tokenizer.tokenize_conversation(conv, True, False)
- prompt_len = len(tokenizer_image_token(self.args, cur_prompt, self.tokenizer))
- target[:prompt_len] = IGNORE_INDEX
+ if self.is_packing_enabled:
+ input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
+ __restore_key__=sample.__restore_key__,
+ __subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
- text=input_ids,
- prompt_len=prompt_len,
- target=target,
+ tokens=torch.tensor(input_ids),
+ labels=torch.tensor(target),
+ total_len=self._get_total_seq_length(input_ids, num_tiles),
)
def encode_llava_pretrain(self, sample: VQASample):
- augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
- use_chat_format = sample.__subflavors__['use_chat_format'] if 'use_chat_format' in sample.__subflavors__ else False
- conv_format = sample.__subflavors__['conv_format'] if 'conv_format' in sample.__subflavors__ else "mistral"
+ """Encode pretrain sample in LLAVA style."""
+ augment = sample.__subflavors__.get("augmentation", False)
imgs = get_visual_transform(
sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
+ self.args.vision_model_type,
)
num_tiles = [len(imgs)]
- assert "" in sample.context
- has_image = True
-
- if use_chat_format:
- prompt_idx = np.random.randint(len(self.manual_prompts["Captioning"]["raw"]))
- prompt = self.manual_prompts["Captioning"]["raw"][prompt_idx]
-
- sample.context = "User: " + "\n" + prompt + " Assistant: "
- conversation = sample.context + sample.answers + conversation_lib.mistral_instruct.sep
- else:
- # LLAVA training: override text-prompt with just IMAGE_TOKEN_INDEX
- sample.context = "" + "\n"
- if conv_format == 'llama3_sft':
- conversation = sample.context + sample.answers + conversation_lib.llama3_instruct.sep
- elif conv_format == "mistral":
- conversation = sample.context + sample.answers + conversation_lib.mistral_instruct.sep2
+ # LLAVA training: override text-prompt with just the image.
+ conv = [
+ # Note: no system message.
+ {"role": "user", "content": "\n"},
+ {"role": "assistant", "content": sample.answers},
+ ]
- input_ids = np.array(tokenizer_image_token(self.args, conversation, self.tokenizer, has_image=has_image))
- target = input_ids.copy()
+ input_ids, target = self.tokenizer.tokenize_conversation(conv, True, False)
- prompt_len = len(tokenizer_image_token(self.args, sample.context, self.tokenizer, has_image=has_image))
- target[:prompt_len] = IGNORE_INDEX
+ if self.is_packing_enabled:
+ input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
+ __restore_key__=sample.__restore_key__,
+ __subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
- text=input_ids,
- prompt_len=prompt_len,
- target=target,
+ tokens=torch.tensor(input_ids),
+ labels=torch.tensor(target),
+ total_len=self._get_total_seq_length(input_ids, num_tiles),
)
- # Based on https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/train/train.py#L500
def encode_llava_sft(self, sample: SimilarityInterleavedSample):
+ """Encode SFT sample."""
augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
- use_chat_format = sample.__subflavors__['use_chat_format'] if 'use_chat_format' in sample.__subflavors__ else False
- has_image = sample.__subflavors__['has_image'] if 'has_image' in sample.__subflavors__ else False
has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False
- has_visual_data = has_image or has_video
- conv_format = sample.__subflavors__['conv_format'] if 'conv_format' in sample.__subflavors__ else "mistral"
+ has_image = sample.__subflavors__['has_image'] if 'has_image' in sample.__subflavors__ else False
+ has_image = has_image or (hasattr(sample, "images") and len(sample.images) > 0)
- if has_image:
- imgs = get_visual_transform(
- sample.images[0], self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
- )
- num_tiles = [len(imgs)]
- elif has_video:
+ if has_video:
# Grab the selected frames of the video as a tensor with shape
# fhwc: (num_frames, height, width, num_channels).
video_fhwc = sample.images[0].permute(0, 2, 3, 1)
@@ -396,132 +322,65 @@ def encode_llava_sft(self, sample: SimilarityInterleavedSample):
imgs += get_visual_transform(
video_frame_hwc, self.img_h, self.img_w,
self.args.use_tiling, self.args.max_num_tiles,
- self.args.use_thumbnail, augment=False)
+ self.args.use_thumbnail, augment, self.args.vision_model_type)
+ num_tiles = [len(imgs)]
+ elif has_image:
+ imgs = get_visual_transform(
+ sample.images[0], self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
+ self.args.vision_model_type,
+ )
num_tiles = [len(imgs)]
else:
imgs = num_tiles = []
sample.__key__ = "{}-{}".format("no-image", sample.__key__)
- if conv_format == 'llama3_sft':
- conv = conversation_lib.llama3_instruct.copy()
- elif conv_format == "mistral":
- conv = conversation_lib.mistral_instruct.copy()
-
- roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
-
- if use_chat_format:
- source = sample.texts
- if roles[source[0]["from"]] != conv.roles[0]:
- # Skip the first one if it is not from human
- source = source[1:]
-
- conv.messages = []
- for j, sentence in enumerate(source):
- role = roles[sentence["from"]]
- assert role == conv.roles[j % 2], sentence
- conv.append_message(role, sentence["value"])
- conversation = conv.get_prompt()
-
- ### Tokenize conversations
- input_ids = tokenizer_image_token(self.args, conversation, self.tokenizer, has_visual_data)
-
- input_ids = torch.LongTensor(input_ids)
- target = input_ids.clone()
-
- if conv.sep_style == conversation_lib.SeparatorStyle.MPT:
- # Mask targets
- sep = conv.sep + conv.roles[1]
-
- total_len = int((target != self.tokenizer.eod_token).sum())
-
- rounds = conversation.split(conv.sep)
- re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
- for conv_idx in range(3, len(rounds), 2):
- re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2])) # user + gpt
-
- cur_len = 0
- target[:cur_len] = IGNORE_INDEX
-
- for i, rou in enumerate(re_rounds):
- if rou == "":
- break
-
- rou += conv.sep
-
- parts = rou.split(sep)
-
- if len(parts) != 2:
- break
- parts[0] += sep
-
- round_len = len(tokenizer_image_token(self.args, rou, self.tokenizer, has_visual_data))
- instruction_len = len(tokenizer_image_token(self.args, parts[0], self.tokenizer, has_visual_data))
-
- if conv_format == 'llama3_sft' and i > 0:
- round_len -= 1
- instruction_len -= 1
-
- target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
-
- cur_len += round_len
-
- target[cur_len:] = IGNORE_INDEX
-
- elif conv.sep_style == conversation_lib.SeparatorStyle.TWO:
- ### Mask targets
- sep = conv.sep + conv.roles[1] + ": "
-
- total_len = int((target != self.tokenizer.eod_token).sum())
-
- rounds = conversation.split(conv.sep2)
-
- cur_len = 0
-
- for i, rou in enumerate(rounds):
- if rou == "":
- break
-
- rou += conv.sep2 # put back conv.sep2 since we will lose it while we conversation.split above with conv.sep2
-
- parts = rou.split(sep)
+ conversation = []
+ # Note: Some tokenizers may ignore the system prompt.
+ conversation.append({"role": "system", "content": "Answer the questions."})
- if len(parts) != 2:
- break
- parts[0] += sep
+ has_image_token = False
- round_len = len(tokenizer_image_token(self.args, rou, self.tokenizer, has_visual_data))
- instruction_len = len(tokenizer_image_token(self.args, parts[0], self.tokenizer, has_visual_data)) - 2
+ for text in sample.texts:
+ if IMAGE_TOKEN in text["value"]:
+ has_image_token = True
- target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
-
- cur_len += round_len
+ if text["from"] == "human":
+ role = "user"
+ elif text["from"] == "gpt":
+ role = "assistant"
+ else:
+ raise RuntimeError(f"unexpected role {text['from']} in {sample.texts}")
- target[cur_len:] = IGNORE_INDEX
+ turn = {"role": role, "content": text["value"]}
+ conversation.append(turn)
- elif conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
- raise NotImplementedError("this tokenizer is not supported yet with this data type")
+ # If the sample contains an image but none of the user messages has an image token,
+ # then add it to the first user message.
+ if len(imgs) > 0 and not has_image_token:
+ for turn in conversation:
+ if turn["role"] == "user":
+ turn["content"] = f"{IMAGE_TOKEN}\n" + turn["content"]
+ break
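+ # e.g., texts [{"from": "human", "value": "What is this?"}, {"from": "gpt", "value": "A cat."}]
+ # with one image become [system, user (IMAGE_TOKEN + "\n" + "What is this?"), assistant ("A cat.")].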
- if cur_len != total_len:
- target[:] = IGNORE_INDEX
+ input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
- raise Exception(
- f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}. Something is wrong, please fix!"
- )
-
- else:
- return NotImplementedError
+ if self.is_packing_enabled:
+ input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
+ __restore_key__=sample.__restore_key__,
+ __subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
- text=input_ids,
- prompt_len=instruction_len,
- target=target,
+ tokens=torch.tensor(input_ids),
+ labels=torch.tensor(target),
+ total_len=self._get_total_seq_length(input_ids, num_tiles),
)
- def encode_vqa(self, sample: VQASample):
+ def encode_any_single_turn_vqa(self, sample):
+ """Encode MultiChoiceVQA or VQA sample."""
augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False
@@ -537,104 +396,199 @@ def encode_vqa(self, sample: VQASample):
imgs += get_visual_transform(
video_frame_hwc, self.img_h, self.img_w,
self.args.use_tiling, self.args.max_num_tiles,
- self.args.use_thumbnail, augment=False)
+ self.args.use_thumbnail, augment, self.args.vision_model_type)
else:
imgs = get_visual_transform(
- sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment,
+ sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles,
+ self.args.use_thumbnail, augment, self.args.vision_model_type,
)
+
num_tiles = [len(imgs)]
- has_image = True
- if "" not in sample.context:
- sample.context = "" + sample.context
+ if isinstance(sample, MultiChoiceVQASample):
+ cur_prompt = format_multichoice_question(sample.context, sample.choices)
+ if "" not in cur_prompt:
+ cur_prompt = "\n" + cur_prompt
+ cur_answer = format_multichoice_answer(sample.correct_choice_idx)
+ elif isinstance(sample, VQASample):
+ if 'docvqa' in sample.__key__:
+ prompt_list = self.manual_prompts["VQASFT"]["docvqa"]
+ elif sample.__subflavors__.get("VQASFT"):
+ prompt_list = self.manual_prompts["VQASFT"]["raw"]
+ else:
+ prompt_list = ["{}"]
+
+ prompt_idx = np.random.randint(len(prompt_list))
+ cur_prompt = prompt_list[prompt_idx]
- if sample.context[-1:] != "\n":
- sample.context = sample.context + "\n"
+ cur_prompt = cur_prompt.format(sample.context)
- if isinstance(sample.answers, list):
- answer_list = sample.answers
- weight_list = np.array(sample.answer_weights).astype(np.float32)
- weight_list = weight_list / np.sum(weight_list)
- answer_idx = np.random.choice(weight_list.shape[0], 1, p=weight_list)[0]
- answer = answer_list[answer_idx]
+ if "" not in cur_prompt:
+ cur_prompt = "\n" + cur_prompt
+
+ if isinstance(sample.answers, list):
+ answer_list = sample.answers
+ weight_list = np.array(sample.answer_weights).astype(np.float32)
+ weight_list = weight_list / np.sum(weight_list)
+ answer_idx = np.random.choice(weight_list.shape[0], 1, p=weight_list)[0]
+ cur_answer = answer_list[answer_idx]
+ else:
+ cur_answer = sample.answers
else:
- answer = sample.answers
+ raise NotImplementedError("Unsupported data type provided", sample)
- conversation = sample.context + answer
- text = np.array(tokenizer_image_token(self.args, conversation, self.tokenizer, has_image=has_image))
+ conversation = [
+ {"role": "system", "content": "Answer the questions."},
+ {"role": "user", "content": cur_prompt},
+ {"role": "assistant", "content": str(cur_answer)},
+ ]
- prompt_len = len(tokenizer_image_token(self.args, sample.context, self.tokenizer, has_image=has_image))
+ input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
- target = text.copy()
- target[:prompt_len] = IGNORE_INDEX
+ if self.is_packing_enabled:
+ input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
+ __restore_key__=sample.__restore_key__,
+ __subflavor__=None,
__subflavors__=sample.__subflavors__,
imgs=imgs,
num_tiles=num_tiles,
- text=text,
- prompt_len=prompt_len,
- target=target,
+ tokens=torch.tensor(input_ids),
+ labels=torch.tensor(target),
+ total_len=self._get_total_seq_length(input_ids, num_tiles),
)
- def encode_ocr(self, sample: OCRSample) -> ImageTaskSample:
- if sample.__subflavors__["type"] == "document":
- visual_transform = self.ocr_document_visual_transform
- elif sample.__subflavors__["type"] == "paragraph":
- visual_transform = self.ocr_paragraph_visual_transform
- elif sample.__subflavors__["augmentation"] == False:
- visual_transform = self.ocr_document_identity_transform
- else:
- raise ValueError(f"Unknown subflavor {sample.__subflavors__}")
-
- if sample.words_boxes is not None and sample.words_boxes.shape[1] >= 5:
- # Boxes with conf below 0.9 are skipped
- filter_words_mask = sample.words_boxes[:, 4] < 0.9
- filter_boxes = sample.words_boxes[filter_words_mask, :4]
- for x, y, x2, y2 in filter_boxes:
- if isinstance(sample.image, Image.Image):
- draw = ImageDraw.Draw(sample.image)
- draw.rectangle([int(x), int(y), (int(x2), int(y2))], fill=0)
- else:
- sample.image[:, int(y) : int(y2) + 1, int(x) : int(x2) + 1] = 0
-
- text = " ".join(
- text for skip, text in zip(filter_words_mask, sample.words_text) if not skip
- )
- else:
- text = " ".join(sample.text.splitlines())
+ def combined_ocr_encoder(self, sample, task_type):
+ """Encode OCR samples."""
+ augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False
- match = re.search(r'"text_sequence": "(.*?)"', text)
- if match:
- text = match.group(1)
+ if task_type == "encode_pdf":
+ sample, cur_prompt, cur_answer = self.encode_pdf_prompt(sample)
+ elif task_type == "encode_ocr_ref":
+ sample, cur_prompt, cur_answer = self.encode_ocr_ref_prompt(sample)
+ elif task_type == "_encode_ocr":
+ sample, cur_prompt, cur_answer = self.encode_ocr_prompt(sample)
- img = visual_transform(sample.image)
- img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - self.pixel_mean) / self.pixel_std
- img = torch.nn.functional.pad(img, (0, self.img_w - img.shape[2], 0, self.img_h - img.shape[1]))
+ imgs = get_visual_transform(
+ sample.image, self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles,
+ self.args.use_thumbnail, augment, self.args.vision_model_type,
+ )
+ num_tiles = [len(imgs)]
- # randomly select a prompt
- prompt_idx = np.random.randint(len(self.manual_prompts["OCR"]["raw"]))
- cur_prompt = self.manual_prompts["OCR"]["raw"][prompt_idx]
+ conversation = [
+ {"role": "system", "content": "Answer the questions."},
+ {"role": "user", "content": cur_prompt},
+ {"role": "assistant", "content": str(cur_answer)},
+ ]
- if cur_prompt not in self.txt_to_token_dict:
- self.txt_to_token_dict[cur_prompt] = self.tokenizer(cur_prompt)
- cur_prompt = self.txt_to_token_dict[cur_prompt]
+ input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False)
- text_sample = self.tokenizer(text)
- prompt_len = len(cur_prompt)
- text_sample = np.concatenate([cur_prompt, text_sample])
+ if self.is_packing_enabled:
+ input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles)
return ImageTaskSample(
__key__=sample.__key__,
+ __restore_key__=sample.__restore_key__,
+ __subflavor__=None,
__subflavors__=sample.__subflavors__,
- imgs=[img],
- num_tiles=[1],
- text=text_sample,
- prompt_len=prompt_len
+ imgs=imgs,
+ num_tiles=num_tiles,
+ tokens=torch.tensor(input_ids),
+ labels=torch.tensor(target),
+ total_len=self._get_total_seq_length(input_ids, num_tiles),
)
- def batch(self, samples: List[ImageTaskSample]) -> ImageTaskBatch:
+ def encode_pdf_prompt(self, sample: OCRSample) -> ImageTaskSample:
+ """Encode OCR sample."""
+ prompt_list = self.manual_prompts["DocPretraining"]["raw"]
+ prompt_idx = np.random.randint(len(prompt_list))
+ cur_prompt = prompt_list[prompt_idx]
+ if "" not in cur_prompt:
+ cur_prompt = "\n" + cur_prompt
+
+ # Make sure there is no extra <image> tag.
+ sample.text = sample.text.replace("<image>", "")
+
+ caption = sample.text.strip()
+
+ split_by_line_flag = sample.__subflavors__.get("SplitByLine")
+ if split_by_line_flag:
+ caption_list = caption.split('\n')
+ caption = np.random.choice(caption_list)
+ cur_answer = caption
+
+ return sample, cur_prompt, cur_answer
+
+ def encode_ocr_ref_prompt(self, sample: OCRSample) -> ImageTaskSample:
+ """Encode OCR sample."""
+ ref = sample.text
+ region = sample.words_boxes
+
+ # Make sure there is no extra <image> tag
+ ref = ref.replace("<image>", "")
+
+ if len(region) == 4:
+ region = f"({region[0]},{region[1]}),({region[2]},{region[3]})"
+ else:
+ region = f"({region[0]},{region[1]}),({region[2]},{region[3]}),({region[4]},{region[5]}),({region[6]},{region[7]})"
+
+ # Randomly choose between two tasks
+ task_idx = np.random.randint(2)
+ if task_idx == 0:
+ # Referring Grounding
+ prompt_list = self.manual_prompts["DocPretraining"]["referring_grounding"]
+ prompt_content = ref
+ answer = region
+ else:
+ # Grounded OCR
+ prompt_list = self.manual_prompts["DocPretraining"]["grounded_ocr"]
+ prompt_content = region
+ answer = ref
+
+ prompt_idx = np.random.randint(len(prompt_list))
+ cur_prompt = prompt_list[prompt_idx]
+ cur_prompt = cur_prompt.format(prompt_content)
+ if "" not in cur_prompt:
+ cur_prompt = "\n" + cur_prompt
+
+ return sample, cur_prompt, answer
+
+ def bbox_coord_to_label(self, text, bbox):
+ """Format bbox coordinates as text."""
+ assert len(bbox) == 4 or len(bbox) == 8
+
+ # Make sure there is no extra <image> tag
+ text = text.replace("<image>", "")
+
+ if len(bbox) == 4:
+ label_str = f"[{text}]({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]})"
+ else:
+ label_str = f"[{text}]({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]}),({bbox[4]},{bbox[5]}),({bbox[6]},{bbox[7]})"
+
+ return label_str
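+ # e.g., bbox_coord_to_label("cat", [10, 20, 110, 220]) returns "[cat](10,20),(110,220)".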
+
+ def encode_ocr_prompt(self, sample: OCRSample) -> ImageTaskSample:
+ """Encode OCR sample."""
+ if isinstance(sample.words_boxes[0], int):
+ answer = self.bbox_coord_to_label(sample.text, sample.words_boxes)
+ elif isinstance(sample.words_boxes[0], list):
+ answer = ""
+ for i, bbox in enumerate(sample.words_boxes):
+ answer += self.bbox_coord_to_label(sample.words_text[i], bbox)
+
+ prompt_list = self.manual_prompts["DocPretraining"]["ocr_multi"]
+ prompt_idx = np.random.randint(len(prompt_list))
+ cur_prompt = prompt_list[prompt_idx]
+
+ if "" not in cur_prompt:
+ cur_prompt = "\n" + cur_prompt
+ cur_answer = answer
+
+ return sample, cur_prompt, cur_answer
+
+ def batch(self, samples: List[Union[ImageTaskSample, ImageTaskSamplePacked]]) -> ImageTaskBatchPacked:
# Stack images to [num_tiles, c, h, w]. If there are no images (text-only), then use a dummy image.
imgs = [img for s in samples for img in s.imgs]
if len(imgs) > 0:
@@ -642,45 +596,128 @@ def batch(self, samples: List[ImageTaskSample]) -> ImageTaskBatch:
else:
imgs = torch.tensor([[0]], dtype=torch.float32)
- # Put tile counts to a single tensor. If there are no images (text-only), then use a dummy tensor.
- num_tiles = torch.tensor([n for s in samples for n in s.num_tiles], dtype=torch.int)
- if len(num_tiles) == 0:
- num_tiles = torch.tensor([[0]], dtype=torch.int)
-
- # If the user hasn't defined a target sequence length, then use the max along the sample lengths.
- max_seq_len = self.seq_len
+ # If the user hasn't defined a target dataloader sequence length, then use the max along the sample lengths.
+ max_seq_len = self.dataloader_seq_length
if not max_seq_len:
- max_seq_len = max(len(s.text) for s in samples)
+ max_seq_len = max(len(s.tokens) for s in samples)
- text_mat = np.full((len(samples), max_seq_len), self.tokenizer.eod_token, dtype=np.int64)
+ tokens = np.full((len(samples), max_seq_len), self.tokenizer.pad, dtype=np.int64)
# +1 to accommodate shift to left by one later.
- target_mat = np.full((len(samples), max_seq_len + 1), self.tokenizer.eod_token, dtype=np.int64)
+ labels = np.full((len(samples), max_seq_len + 1), self.tokenizer.pad, dtype=np.int64)
for i, s in enumerate(samples):
# If the sample/target length exceeds the target sequence length, then truncate.
- text_len = min(max_seq_len, len(s.text))
- target_len = min(max_seq_len+1, len(s.target))
+ text_len = min(max_seq_len, len(s.tokens))
+ target_len = min(max_seq_len+1, len(s.labels))
- text_mat[i, :text_len] = np.array(s.text)[:text_len]
- target_mat[i, :target_len] = np.array(s.target)[:target_len]
+ tokens[i, :text_len] = s.tokens[:text_len]
+ labels[i, :target_len] = s.labels[:target_len]
- batch = ImageTaskBatch(
- __keys__=[s.__key__ for s in samples],
- __subflavors__=[s.__subflavors__ for s in samples],
+ num_tiles = torch.tensor([n for s in samples for n in s.num_tiles], dtype=torch.int32)
+ if len(num_tiles) == 0:
+ num_tiles = torch.tensor([[0]], dtype=torch.int32)
+
+ # Cumulative sample lengths are needed for packing, otherwise use dummy values.
+ cu_lengths = torch.tensor([[0]], dtype=torch.int32)
+ max_lengths = torch.tensor([[0]], dtype=torch.int32)
+
+ if self.is_packing_enabled:
+ cu_lengths = torch.stack([s.cu_lengths for s in samples])
+ max_lengths = torch.tensor([s.max_length for s in samples], dtype=torch.int32)
+
+ return ImageTaskBatchPacked(
+ __key__=[s.__key__ for s in samples],
+ __restore_key__=[s.__restore_key__ for s in samples],
+ __subflavor__=None,
+ __subflavors__=samples[0].__subflavors__,
+ tokens=tokens,
+ labels=labels,
imgs=imgs,
num_tiles=num_tiles,
- text=torch.from_numpy(text_mat),
- prompt_len=torch.from_numpy(np.array([s.prompt_len for s in samples], dtype=np.int64)),
- target=torch.from_numpy(target_mat),
+ cu_lengths=cu_lengths,
+ max_lengths=max_lengths,
)
- return batch
-
- def encode_batch(self, batch: ImageTaskBatch) -> dict:
+ def encode_batch(self, batch: ImageTaskBatchPacked) -> dict:
raw = dataclasses.asdict(batch)
del raw["__subflavors__"]
return raw
+ def select_samples_to_pack(self, samples: List[ImageTaskSample]) -> List[List[ImageTaskSample]]:
+ """Selects which samples will be packed together.
+
+ NOTE: Energon dataloader calls this method internally if packing is used.
+ Please see https://nvidia.github.io/Megatron-Energon/packing.html
+ """
+ lengths = [sample.total_len for sample in samples]
+
+ packed_samples = greedy_knapsack(lengths, samples, self.packing_seq_length)
+
+ return packed_samples
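+ # e.g., samples with total_len 300, 500 and 700 and packing_seq_length=1024 are
+ # grouped into two packs: [700, 300] and [500].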
+
+ @stateless
+ def pack_selected_samples(self, samples: List[ImageTaskSample]) -> List[ImageTaskSamplePacked]:
+ """
+ Function to pack a list of ImageTaskSample into a single ImageTaskSamplePacked.
+
+ NOTE: Energon dataloader calls this method internally if packing is used.
+ Please see https://nvidia.github.io/Megatron-Energon/packing.html
+
+ Args:
+ samples: List of ImageTaskSample instances to pack into one sample.
+
+ Returns:
+ ImageTaskSamplePacked instance.
+ """
+ packing_seq_len = self.packing_seq_length
+
+ packed_tokens = []
+ packed_labels = []
+ packed_imgs = []
+
+ current_length = 0
+ max_length = 0
+ cu_lengths = [0]
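+ # e.g., packing two sub-samples of lengths 300 and 500 yields cu_lengths [0, 300, 800]
+ # and max_length 500.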
+
+ # Process each sample and build lists that we will concatenate to create the packed sample.
+ for _, sample in enumerate(samples):
+ sample_len = sample.total_len
+
+ if sample_len > max_length:
+ max_length = sample_len
+
+ # If adding this sample exceeds the max length, stop.
+ # This should not happen. The select_samples_to_pack method should have already ensured that the samples fit.
+ if current_length + sample_len > packing_seq_len:
+ raise ValueError(f"Packed sample exceeds the maximum sequence length of {packing_seq_len}: {samples}")
+
+ # Add the sample's tokens and labels
+ packed_tokens.append(sample.tokens)
+ packed_labels.append(sample.labels)
+
+ # Add the images
+ packed_imgs += sample.imgs
+
+ current_length += sample_len
+ cu_lengths.append(current_length)
+
+ # Concatenate packed tokens and labels.
+ packed_tokens = torch.cat(packed_tokens, dim=0)
+ packed_labels = torch.cat(packed_labels, dim=0)
+
+ return ImageTaskSamplePacked(
+ __key__=",".join([s.__key__ for s in samples]),
+ __restore_key__=(), # Will be set by energon based on `samples`
+ __subflavor__=None,
+ __subflavors__=samples[0].__subflavors__,
+ tokens=packed_tokens,
+ labels=packed_labels,
+ imgs=packed_imgs,
+ cu_lengths=torch.tensor(cu_lengths, dtype=torch.int32),
+ max_length=max_length,
+ num_tiles=[n for s in samples for n in s.num_tiles],
+ )
+
def print_error_handler(exc: Exception, key: Optional[str]):
print(
@@ -689,35 +726,18 @@ def print_error_handler(exc: Exception, key: Optional[str]):
)
traceback.print_exc()
-# From https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/mm_utils.py#L185
-def tokenizer_image_token(args, prompt, tokenizer, has_image=True, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
-
- if not has_image:
- input_ids = tokenizer(prompt)
-
- else:
- prompt_chunks = [tokenizer(chunk) for chunk in prompt.split('<image>')]
-
- def insert_separator(X, sep):
- return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
-
- input_ids = []
- offset = 0
- if args.tokenizer_type in ['Llama2Tokenizer', 'Llama3Tokenizer'] and len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0:
- offset = 1
- input_ids.append(prompt_chunks[0][0])
+def format_multichoice_question(question, multichoice_options):
+ """Format multi-choice question."""
+ options_text = ["{}. {}\n".format(chr(ord('A') + i), option) for i, option in
+ zip(range(len(multichoice_options)), multichoice_options)]
+ options_text = "".join(options_text)
- for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
- input_ids.extend(x[offset:])
+ options_text = f"{options_text}Answer with the option's letter from the given choices directly."
- if return_tensors is not None:
- if return_tensors == 'pt':
- return torch.tensor(input_ids, dtype=torch.long)
- raise ValueError(f'Unsupported tensor type: {return_tensors}')
+ return "{}\n{}".format(question, options_text)
- # # remove BOS token
- # if args.tokenizer_type in ['Llama2Tokenizer', 'Llama3Tokenizer']:
- # return input_ids[1:]
- return input_ids
+def format_multichoice_answer(idx):
+ """Format multi-choice answer."""
+ return chr(ord('A') + idx)
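+ # e.g., format_multichoice_answer(1) returns "B".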
diff --git a/examples/multimodal/evaluate_ai2d.py b/examples/multimodal/evaluate_ai2d.py
new file mode 100644
index 0000000000..39b866ae4a
--- /dev/null
+++ b/examples/multimodal/evaluate_ai2d.py
@@ -0,0 +1,52 @@
+import argparse
+import json
+
+from evaluate_mmmu import get_input_output_paths
+from evaluate_vqav2 import compute_vqa_accuracy
+
+
+def merge_input_files(input_path):
+ """Merge input files to a format compatible with the evaluator."""
+ input_file_paths, output_file_path = get_input_output_paths(input_path, task="AI2D")
+
+ results = dict()
+
+ for input_file_path in input_file_paths:
+ with open(input_file_path, "r") as input_file:
+ for line in input_file:
+ res = json.loads(line)
+ sample_id = res["sample_id"]
+
+ # Ignore possible duplicates.
+ if sample_id in results:
+ continue
+
+ results[sample_id] = {
+ "question_id": sample_id,
+ "answer": res["answer"],
+ "gt_answer": res["gt_answer"],
+ }
+
+ results = list(results.values())
+
+ with open(output_file_path, "w") as output_file:
+ json.dump(results, output_file)
+
+ return output_file_path
+
+
+def ai2d_eval(input_path):
+ """Run AI2D evaluation."""
+ result_file_path = merge_input_files(input_path)
+ avg_acc = compute_vqa_accuracy(result_file_path, task="AI2D")
+ return avg_acc
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--input-path', type=str, help="Path to input file(s)")
+ args = parser.parse_args()
+
+ avg_acc = ai2d_eval(args.input_path)
+
+ print(f"===== AI2D Accuracy {avg_acc:.2f}% =====")
diff --git a/examples/multimodal/evaluate_chartqa.py b/examples/multimodal/evaluate_chartqa.py
index 8ec346d0d1..53d4944f46 100644
--- a/examples/multimodal/evaluate_chartqa.py
+++ b/examples/multimodal/evaluate_chartqa.py
@@ -9,15 +9,22 @@ def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="ChartQA")
- results = []
+ results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
- res["question_id"] = res["sample_id"]
+ sample_id = res["sample_id"]
- results.append(res)
+ # Ignore possible duplicates.
+ if sample_id in results:
+ continue
+
+ res["question_id"] = sample_id
+ results[sample_id] = res
+
+ results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
@@ -28,7 +35,7 @@ def merge_input_files(input_path):
def chartqa_eval(input_path):
"""Run ChartQA evaluation."""
result_file_path = merge_input_files(input_path)
- return compute_vqa_accuracy(result_file_path, use_chartqa_metric=True)
+ return compute_vqa_accuracy(result_file_path, task="ChartQA")
if __name__ == "__main__":
diff --git a/examples/multimodal/evaluate_coco.py b/examples/multimodal/evaluate_coco.py
index a717090c92..8eeb367e8f 100644
--- a/examples/multimodal/evaluate_coco.py
+++ b/examples/multimodal/evaluate_coco.py
@@ -11,20 +11,28 @@ def convert_to_coco_format(input_path):
"""Convert input files to COCO compatible format."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="captioning")
- captions = []
+ results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
+ sample_id = res["sample_id"]
- question_id = res['sample_id']
- caption = res['caption'].rstrip('.').lower()
+ # Ignore possible duplicates.
+ if sample_id in results:
+ continue
- captions.append({"image_id": question_id, "caption": caption})
+ caption = res["caption"].rstrip(".").lower()
+ results[sample_id] = {
+ "image_id": sample_id,
+ "caption": caption,
+ }
+
+ results = list(results.values())
with open(output_file_path, "w") as output_file:
- json.dump(captions, output_file, indent=4)
+ json.dump(results, output_file, indent=4)
return output_file_path
diff --git a/examples/multimodal/evaluate_mathvista.py b/examples/multimodal/evaluate_mathvista.py
new file mode 100644
index 0000000000..a55f312f21
--- /dev/null
+++ b/examples/multimodal/evaluate_mathvista.py
@@ -0,0 +1,122 @@
+import argparse
+import json
+import re
+
+from evaluate_mmmu import get_input_output_paths
+from MMMU.mmmu.utils.eval_utils import parse_multi_choice_response
+from open_flamingo.eval.vqa_metric import VQAEval
+
+
+def merge_input_files(input_path):
+ """Merge input files to a format compatible with the evaluator."""
+ input_file_paths, output_file_path = get_input_output_paths(input_path, task="MathVista")
+
+ results = dict()
+
+ for input_file_path in input_file_paths:
+ with open(input_file_path, "r") as input_file:
+ for line in input_file:
+ res = json.loads(line)
+ sample_id = res["sample_id"]
+
+ # Remove possible duplicates.
+ if sample_id in results:
+ continue
+
+ results[sample_id] = res
+
+ results = list(results.values())
+
+ with open(output_file_path, "w") as output_file:
+ json.dump(results, output_file)
+
+ return output_file_path
+
+
+def extra_processing(text):
+ """Extra processing."""
+ # Cap numeric answers at two decimal places.
+ regex = re.compile(r'^\d+\.\d+$')
+ decimal = regex.findall(text)
+
+ if len(decimal) > 0:
+ non_decimal = len(decimal[0].split(".")[0])
+
+ # if decimal values are all 0, trim them
+ decimal_digits = [int(d) for d in decimal[0].split(".")[1]]
+ if sum(decimal_digits) == 0:
+ text = decimal[0][:non_decimal]
+ else:
+ text = decimal[0][: non_decimal + 3]
+
+ # remove % and trailing .
+ text = text.replace("%", "")
+ if text[-1] == ".":
+ text = text[:-1]
+
+ return text
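+ # Examples: extra_processing("3.14159") -> "3.14", extra_processing("45.00") -> "45",
+ # extra_processing("80%") -> "80".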
+
+
+def extract_answer(text):
+ """Extract answer."""
+ alphabet = re.findall(r'[a-zA-Z]+', text)
+ if len(alphabet) > 0 and "e+" not in text:
+ template = re.findall(r'answer is -*\d+\.*\d*', text)
+ if len(template) > 0:
+ text = template[0]
+
+ numbers = re.findall(r'-*\d+\.*\d*', text)
+ text = numbers[0] if len(numbers) > 0 else text
+
+ return text
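+ # e.g., extract_answer("The answer is 42.5 meters") -> "42.5"; purely numeric strings pass through unchanged.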
+
+
+def compute_mathvista_accuracy(result_file):
+ """Compute MathVista accuracy."""
+ merged_results = json.load(open(result_file))
+
+ vqa = VQAEval(vqa=None, vqaRes=None)
+ acc = 0
+ for res in merged_results:
+ pred_ans = res["answer"]
+ if res["question_type"] == "multi_choice":
+ pred_ans = parse_multi_choice_response(pred_ans, res["all_choices"], res["index2ans"])
+ else:
+ pred_ans = vqa.processPunctuation(pred_ans)
+ pred_ans = vqa.processDigitArticle(pred_ans)
+ # Extra processing and extraction.
+ pred_ans = extra_processing(pred_ans)
+ pred_ans = extract_answer(pred_ans)
+
+ gt_ans = res["gt_answer"]
+ if isinstance(gt_ans, list):
+ assert len(gt_ans) == 1, f"Expected 1 groundtruth, got {gt_ans}"
+ gt_ans = gt_ans[0]
+
+ if res["question_type"] != "multi_choice":
+ gt_ans = vqa.processPunctuation(gt_ans)
+ gt_ans = vqa.processDigitArticle(gt_ans)
+
+ gt_ans = extra_processing(gt_ans)
+
+ if pred_ans == gt_ans:
+ acc += 1
+ acc = acc / len(merged_results) * 100
+ return acc
+
+
+def mathvista_eval(input_path):
+ """Run MathVista evaluation."""
+ result_file_path = merge_input_files(input_path)
+ acc = compute_mathvista_accuracy(result_file_path)
+ return acc
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--input-path', type=str, help="Path to input file(s)")
+ args = parser.parse_args()
+
+ acc = mathvista_eval(args.input_path)
+
+ print(f"===== MathVista accuracy: {acc} =====")
diff --git a/examples/multimodal/evaluate_mmmu.py b/examples/multimodal/evaluate_mmmu.py
index 955be95842..22c3921f25 100644
--- a/examples/multimodal/evaluate_mmmu.py
+++ b/examples/multimodal/evaluate_mmmu.py
@@ -40,6 +40,18 @@ def convert_to_mmmu_format(input_path):
sample_id = res["sample_id"]
prediction = res["prediction"]
+ if res["question_type"] == "multiple-choice":
+ from MMMU.mmmu.utils.eval_utils import parse_multi_choice_response
+
+ prediction = parse_multi_choice_response(
+ prediction, res["all_choices"], res["index2ans"]
+ )
+
+ # MMMU eval script expects just a sample_id to prediction mapping.
+ # Skip possible duplicates.
+ if sample_id in output:
+ continue
+
output[sample_id] = prediction
with open(output_file_path, "w") as output_file:
@@ -69,7 +81,7 @@ def mmmu_eval(input_path, groundtruth_path):
print(output.stderr)
print(output.stdout)
- m = re.search("'Overall': {'num': \d, 'acc': (\d.\d+)}", output.stdout)
+ m = re.search("'Overall': {'num': \d+, 'acc': (\d.\d+)}", output.stdout)
return float(m.group(1)) * 100.0
diff --git a/examples/multimodal/evaluate_ocrbench.py b/examples/multimodal/evaluate_ocrbench.py
new file mode 100644
index 0000000000..b37473a67d
--- /dev/null
+++ b/examples/multimodal/evaluate_ocrbench.py
@@ -0,0 +1,137 @@
+import argparse
+import json
+
+from evaluate_mmmu import get_input_output_paths
+
+
+def merge_input_files(input_path):
+ """Merge input files to a format compatible with the evaluator."""
+ input_file_paths, output_file_path = get_input_output_paths(input_path, task="OCRBench")
+
+ results = dict()
+
+ for input_file_path in input_file_paths:
+ with open(input_file_path, "r") as input_file:
+ for line in input_file:
+ res = json.loads(line)
+ sample_id = res["sample_id"]
+
+ # Remove possible duplicates.
+ if sample_id in results:
+ continue
+
+ results[sample_id] = res
+
+ results = list(results.values())
+
+ with open(output_file_path, "w") as output_file:
+ json.dump(results, output_file)
+
+ return output_file_path
+
+
+def compute_ocrbench_score(result_file):
+ """Compute OCRBench score."""
+ merged_results = json.load(open(result_file))
+
+ # OCRBench score calculation is adopted from https://github.com/Yuliang-Liu/MultimodalOCR/blob/1b7713f44c91f30f64efb6d3e494c416861ef15f/example.py#L1
+ # MIT License. Copyright (c) 2023 Yuliang Liu
+ score = {
+ "Regular Text Recognition": 0,
+ "Irregular Text Recognition": 0,
+ "Artistic Text Recognition": 0,
+ "Handwriting Recognition": 0,
+ "Digit String Recognition": 0,
+ "Non-Semantic Text Recognition": 0,
+ "Scene Text-centric VQA": 0,
+ "Doc-oriented VQA": 0,
+ "Doc-oriented VQA": 0,
+ "Key Information Extraction": 0,
+ "Handwritten Mathematical Expression Recognition": 0,
+ }
+
+ for res in merged_results:
+ predict = res["answer"]
+ answers = res["gt_answer"]
+
+ dataset_name = res["dataset_name"]
+ ocr_type = res["data_type"]
+
+ if dataset_name == "HME100k":
+ if isinstance(answers, list):
+ for j in range(len(answers)):
+ answer = answers[j].strip().replace("\n", " ").replace(" ", "")
+ predict = predict.strip().replace("\n", " ").replace(" ", "")
+ if answer in predict:
+ score[ocr_type] += 1
+ else:
+ answers = answers.strip().replace("\n", " ").replace(" ", "")
+ predict = predict.strip().replace("\n", " ").replace(" ", "")
+ if answers in predict:
+ score[ocr_type] += 1
+ else:
+ if isinstance(answers, list):
+ for j in range(len(answers)):
+ answer = answers[j].lower().strip().replace("\n", " ")
+ predict = predict.lower().strip().replace("\n", " ")
+ if answer in predict:
+ score[ocr_type] += 1
+ else:
+ answers = answers.lower().strip().replace("\n", " ")
+ predict = predict.lower().strip().replace("\n", " ")
+ if answers in predict:
+ score[ocr_type] += 1
+
+ recognition_score = (
+ score['Regular Text Recognition']
+ + score['Irregular Text Recognition']
+ + score['Artistic Text Recognition']
+ + score['Handwriting Recognition']
+ + score['Digit String Recognition']
+ + score['Non-Semantic Text Recognition']
+ )
+ final_score = (
+ recognition_score
+ + score['Scene Text-centric VQA']
+ + score['Doc-oriented VQA']
+ + score['Key Information Extraction']
+ + score['Handwritten Mathematical Expression Recognition']
+ )
+ result_log = f"""###########################OCRBench##############################
+Text Recognition(Total 300): {recognition_score}
+------------------Details of Recognition Score-------------------
+Regular Text Recognition(Total 50): {score['Regular Text Recognition']}
+Irregular Text Recognition(Total 50): {score['Irregular Text Recognition']}
+Artistic Text Recognition(Total 50): {score['Artistic Text Recognition']}
+Handwriting Recognition(Total 50): {score['Handwriting Recognition']}
+Digit String Recognition(Total 50): {score['Digit String Recognition']}
+Non-Semantic Text Recognition(Total 50): {score['Non-Semantic Text Recognition']}
+----------------------------------------------------------------
+Scene Text-centric VQA(Total 200): {score['Scene Text-centric VQA']}
+----------------------------------------------------------------
+Doc-oriented VQA(Total 200): {score['Doc-oriented VQA']}
+----------------------------------------------------------------
+Key Information Extraction(Total 200): {score['Key Information Extraction']}
+----------------------------------------------------------------
+Handwritten Mathematical Expression Recognition(Total 100): {score['Handwritten Mathematical Expression Recognition']}
+----------------------Final Score-------------------------------
+Final Score(Total 1000): {final_score}"""
+
+ return result_log, final_score
+
+
+def ocrbench_eval(input_path):
+ """Run OCRBench evaluation."""
+ result_file_path = merge_input_files(input_path)
+ result_log, score = compute_ocrbench_score(result_file_path)
+ return result_log, score
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--input-path', type=str, help="Path to input file(s)")
+ args = parser.parse_args()
+
+ result_log, _ = ocrbench_eval(args.input_path)
+
+ print(result_log)
diff --git a/examples/multimodal/evaluate_textvqa.py b/examples/multimodal/evaluate_textvqa.py
index e231b8e2c2..af782bdf03 100644
--- a/examples/multimodal/evaluate_textvqa.py
+++ b/examples/multimodal/evaluate_textvqa.py
@@ -9,22 +9,25 @@ def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="TextVQA")
- results = []
+ results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
- results.append(
- {
- "question_id": res["sample_id"],
- "answer": res["answer"],
- "gt_answer": res["gt_answer"],
- }
- )
+ sample_id = res["sample_id"]
- # Make order deterministic.
- # results = sorted(results, key=lambda d: d["question_id"])
+ # Remove possible duplicates.
+ if sample_id in results:
+ continue
+
+ results[sample_id] = {
+ "question_id": sample_id,
+ "answer": res["answer"],
+ "gt_answer": res["gt_answer"],
+ }
+
+ results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
@@ -35,7 +38,7 @@ def merge_input_files(input_path):
def textvqa_eval(input_path):
"""Run TextVQA evaluation."""
result_file_path = merge_input_files(input_path)
- avg_acc = compute_vqa_accuracy(result_file_path)
+ avg_acc = compute_vqa_accuracy(result_file_path, task="TextVQA")
return avg_acc
diff --git a/examples/multimodal/evaluate_vqav2.py b/examples/multimodal/evaluate_vqav2.py
index 9e3b727501..7807d80723 100644
--- a/examples/multimodal/evaluate_vqav2.py
+++ b/examples/multimodal/evaluate_vqav2.py
@@ -9,15 +9,22 @@ def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2")
- results = []
+ results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
- res["question_id"] = res["sample_id"]
+ sample_id = res["sample_id"]
- results.append(res)
+ # Skip possible duplicates.
+ if sample_id in results:
+ continue
+
+ res["question_id"] = sample_id
+ results[sample_id] = res
+
+ results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
@@ -34,7 +41,7 @@ def is_number(n: str):
return False
-def compute_vqa_accuracy(result_file, use_chartqa_metric=False):
+def compute_vqa_accuracy(result_file, task):
"""Compute VQA accuracy."""
merged_results = json.load(open(result_file))
@@ -51,12 +58,15 @@ def compute_vqa_accuracy(result_file, use_chartqa_metric=False):
# ChartQA uses relaxed accuracy:
# "We consider an answer to be correct if it is within 5% of the gold answer.
- # For non-numeric answers, we still need an exact match to consider an answer to be correct."
- if use_chartqa_metric:
+ # For non-numeric answers, we still need an exact match to consider an answer to be correct."
+ if task == "ChartQA":
acc = 0.0
assert len(gt) == 1, "expected exactly one groundtruth answer."
gt = gt[0]
+ pred = pred.rstrip("%")
+ gt = gt.rstrip("%")
+
if is_number(pred) and is_number(gt):
pred = float(pred)
gt = float(gt)
@@ -66,10 +76,16 @@ def compute_vqa_accuracy(result_file, use_chartqa_metric=False):
acc = 1.0
all_acc.append(acc)
- else:
+ elif task in ("VQAv2", "TextVQA"):
num_match = sum([pred == ans for ans in gt])
acc = min(1.0, num_match / 3.0)
all_acc.append(acc)
+ elif task == "AI2D":
+ assert len(gt) == 1, f"Expected exactly 1 GT, got {gt}"
+ acc = pred == gt[0]
+ all_acc.append(acc)
+ else:
+ raise NotImplementedError(f"unknown task {task}")
acc_avg = sum(all_acc) / len(all_acc) * 100
@@ -79,7 +95,7 @@ def compute_vqa_accuracy(result_file, use_chartqa_metric=False):
def vqav2_eval(input_path):
"""Run VQAv2 evaluation."""
result_file = merge_input_files(input_path)
- avg_acc = compute_vqa_accuracy(result_file)
+ avg_acc = compute_vqa_accuracy(result_file, task="VQAv2")
return avg_acc
diff --git a/examples/multimodal/evaluation_datasets.py b/examples/multimodal/evaluation_datasets.py
new file mode 100644
index 0000000000..50a50d5687
--- /dev/null
+++ b/examples/multimodal/evaluation_datasets.py
@@ -0,0 +1,920 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+"""Evaluation datasets."""
+import glob
+import itertools
+import json
+import os
+import re
+from collections import defaultdict
+
+import numpy as np
+import torch
+from image_processing import get_visual_transform
+from PIL import Image
+
+from megatron.training import print_rank_0
+
+
+def _get_partition_bounds(
+ total_num_samples, num_samples_per_partition, num_partitions, partition_id
+):
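+    """Return the [start, end) sample index range that the given partition should process."""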
+ if num_samples_per_partition == 0:
+ samples_per_partition = [
+ int(x) for x in np.linspace(0, total_num_samples, num_partitions + 1)
+ ]
+ return samples_per_partition[partition_id], samples_per_partition[partition_id + 1]
+ return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1)
+
+
+class VQADataset(torch.utils.data.Dataset):
+ """VQA evaluation dataset."""
+
+ def __init__(
+ self,
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ keys,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ ):
+ samples = json.load(open(gt_path, encoding='utf-8'))
+ if "data" in samples:
+ samples = samples["data"]
+
+ # Optionally, process only a subset of the input files.
+ if num_partitions > 0:
+ lb, ub = _get_partition_bounds(
+ len(samples), num_samples_per_partition, num_partitions, partition_id
+ )
+ samples = samples[lb:ub]
+
+ self._keys = keys
+ self._samples = samples
+ self._input_image_path = input_image_path
+ self._img_h = img_h
+ self._img_w = img_w
+ self._use_tiling = use_tiling
+ self._max_num_tiles = max_num_tiles
+ self._use_thumbnail = use_thumbnail
+ self._vision_model_type = vision_model_type
+
+ def __len__(self):
+ return len(self._samples)
+
+ def __getitem__(self, idx):
+ sample = self._samples[idx]
+
+ img_file = "{}/{}".format(self._input_image_path, sample[self._keys["image_id"]])
+ if not os.path.exists(img_file):
+ img_file += ".jpg"
+
+ if not os.path.exists(img_file):
+ img_file = img_file.replace('.jpg', '.png')
+
+ img = Image.open(img_file)
+ imgs = get_visual_transform(
+ img,
+ self._img_h,
+ self._img_w,
+ self._use_tiling,
+ self._max_num_tiles,
+ self._use_thumbnail,
+ augment=False,
+ vision_model_type=self._vision_model_type,
+ )
+ tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+ sample_id = idx
+ if "sample_id" in self._keys:
+ sample_id = sample[self._keys["sample_id"]]
+
+ metadata = "" # Not used.
+
+ return (
+ torch.stack(imgs),
+ tile_count,
+ sample_id,
+ sample[self._keys["question"]],
+ sample[self._keys["answer"]],
+ metadata,
+ )
+
+
+class CaptioningDataset(torch.utils.data.Dataset):
+ """Captioning evaluation dataset."""
+
+ def __init__(
+ self,
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ ):
+ image_files = sorted(glob.glob(input_image_path + "/*"))
+
+ # Optionally, process only a subset of the input files.
+ if num_partitions > 0:
+ lb, ub = _get_partition_bounds(
+ len(image_files), num_samples_per_partition, num_partitions, partition_id
+ )
+ image_files = image_files[lb:ub]
+
+ gts = json.load(open(gt_path))
+ answers = defaultdict(list)
+ for gt in gts["annotations"]:
+ answers[gt["image_id"]].append(gt['caption'])
+
+ self._image_files = image_files
+ self._answers = answers
+ self._img_h = img_h
+ self._img_w = img_w
+ self._use_tiling = use_tiling
+ self._max_num_tiles = max_num_tiles
+ self._use_thumbnail = use_thumbnail
+ self._vision_model_type = vision_model_type
+
+ def __len__(self):
+ return len(self._image_files)
+
+ def __getitem__(self, idx):
+ img_file = self._image_files[idx]
+ image_id = int(img_file.split("_")[-1].split(".")[0])
+
+ img = Image.open(img_file)
+ imgs = get_visual_transform(
+ img,
+ self._img_h,
+ self._img_w,
+ self._use_tiling,
+ self._max_num_tiles,
+ self._use_thumbnail,
+ augment=False,
+ vision_model_type=self._vision_model_type,
+ )
+
+ tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+ question = "" # Fixed for all samples.
+ metadata = "" # Not used.
+
+ return torch.stack(imgs), tile_count, image_id, question, self._answers[image_id], metadata
+
+
+class MMMUDataset(torch.utils.data.Dataset):
+ """MMMU evaluation dataset."""
+
+ def __init__(
+ self,
+ input_image_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ prompt_style,
+ vision_model_type,
+ ):
+ import datasets
+ from MMMU.mmmu.utils.data_utils import CAT_SHORT2LONG, load_yaml
+
+ # The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation.
+ all_mmmu_datasets = []
+
+ hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
+ assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
+
+ for subject in CAT_SHORT2LONG.values():
+            # Use a local copy of the dataset if it exists (can be faster), otherwise the HF one.
+ if os.path.exists(input_image_path):
+ subject_dataset = datasets.load_dataset(
+ os.path.join(input_image_path, subject),
+ split=datasets.Split.VALIDATION,
+ cache_dir=hf_datasets_cache,
+ verification_mode="no_checks",
+ )
+ else:
+ subject_dataset = datasets.load_dataset(
+ "MMMU/MMMU",
+ subject,
+ split=datasets.Split.VALIDATION,
+ cache_dir=hf_datasets_cache,
+ )
+
+ all_mmmu_datasets.append(subject_dataset)
+
+ dataset = datasets.concatenate_datasets(all_mmmu_datasets)
+
+ dataset = [s for s in dataset if s['id'].startswith("val")]
+
+ # Optionally, process only a subset of the input files.
+ if num_partitions > 0:
+ lb, ub = _get_partition_bounds(
+ len(dataset), num_samples_per_partition, num_partitions, partition_id
+ )
+ dataset = dataset[lb:ub]
+
+ # Using the LLaVA config from the MMMU repo.
+ config = load_yaml("examples/multimodal/MMMU/mmmu/configs/llava1.5.yaml")
+ for k, v in config.items():
+ if isinstance(v, list):
+ assert len(v) == 1, "only one value supported."
+ config[k] = v[0]
+
+ self._config = config
+
+ self._dataset = dataset
+
+ self._img_h = img_h
+ self._img_w = img_w
+ self._use_tiling = use_tiling
+ self._max_num_tiles = max_num_tiles
+ self._use_thumbnail = use_thumbnail
+ self._prompt_style = prompt_style
+ self._vision_model_type = vision_model_type
+
+ def __len__(self):
+ return len(self._dataset)
+
+ def __getitem__(self, idx):
+ from MMMU.mmmu.utils.data_utils import construct_prompt, process_single_sample
+
+ sample = self._dataset[idx]
+
+ # Use the single image approach from the MMMU repo.
+ if self._prompt_style == "single_image":
+ sample = process_single_sample(sample)
+ sample = construct_prompt(sample, self._config)
+
+ img = sample["image"]
+ sample_imgs = get_visual_transform(
+ img,
+ self._img_h,
+ self._img_w,
+ self._use_tiling,
+ self._max_num_tiles,
+ self._use_thumbnail,
+ augment=False,
+ vision_model_type=self._vision_model_type,
+ )
+ sample_num_tiles = [len(sample_imgs)]
+
+ prompt = sample["final_input_prompt"]
+ for i in range(8):
+                prompt = prompt.replace(f"<image {i}>", "")
+            sample["final_input_prompt"] = f"<image>\n{prompt}"
+ elif self._prompt_style == "vlmevalkit":
+ sample = construct_prompt(sample, self._config)
+
+ if sample["question_type"] == "multiple-choice":
+ question = sample["question"]
+
+ options = ""
+ for k, v in sample["index2ans"].items():
+ options += f"{k}. {v}\n"
+
+ final_prompt = f"{question}\n"
+ if "hint" in sample:
+ final_prompt += f"Hint: {sample['hint']}\n"
+
+ if "task_instructions" in sample:
+ final_prompt += f"Task instructions: {sample['task_instructions']}\n"
+
+ final_prompt += options
+ final_prompt += "Answer with the option's letter from the given choices directly."
+
+ sample["final_input_prompt"] = final_prompt.rstrip()
+ else:
+ question = sample["question"]
+ final_prompt = f"{question}\n"
+ final_prompt += "Answer the question directly."
+ sample["final_input_prompt"] = final_prompt.rstrip()
+
+ sample_imgs = []
+ sample_num_tiles = []
+
+            img_indices = sorted(list(set(re.findall(r"<image (\d+)", sample["final_input_prompt"]))))
+            # If there are multiple input images, avoid the total number of image embeddings getting too large.
+            adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
+
+            for img_idx in img_indices:
+                img_key = f"image_{img_idx}"
+                img_str = f"<image {img_idx}>"
+
+                img = sample[img_key]
+                assert img is not None, f"{img_str} is in prompt but not in sample images"
+
+                imgs = get_visual_transform(
+                    img,
+                    self._img_h,
+                    self._img_w,
+                    self._use_tiling,
+                    adjusted_max_num_tiles,
+                    self._use_thumbnail,
+                    augment=False,
+                    vision_model_type=self._vision_model_type,
+                ) # List of tiles.
+
+                sample_imgs.extend(imgs)
+                sample_num_tiles.append(len(imgs))
+
+            sample["final_input_prompt"] = " ".join([f'<image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"]
+ elif self._prompt_style == "multi_image":
+ sample = construct_prompt(sample, self._config)
+
+ sample_imgs = []
+ sample_num_tiles = []
+
+            img_indices = re.findall(r"<image (\d+)", sample["final_input_prompt"])
+            adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
+
+            for img_idx in img_indices:
+                img_key = f"image_{img_idx}"
+                img_str = f"<image {img_idx}>"
+                img = sample[img_key]
+                assert img is not None, f"{img_str} is in prompt but not in sample images"
+
+ # Note: Only replace the current image tag.
+ sample["final_input_prompt"] = sample["final_input_prompt"].replace(
+                    img_str, "<image>", 1
+ )
+
+ imgs = get_visual_transform(
+ img,
+ self._img_h,
+ self._img_w,
+ self._use_tiling,
+ adjusted_max_num_tiles,
+ self._use_thumbnail,
+ augment=False,
+ vision_model_type=self._vision_model_type,
+ ) # List of tiles.
+
+ sample_imgs.extend(imgs)
+ sample_num_tiles.append(len(imgs))
+
+ # Sanity check.
+ for i in range(1, 8):
+ assert (
+                    f"<image {i}>" not in sample["final_input_prompt"]
+ ), "prompt contains unhandled image tags"
+ else:
+ raise ValueError(f"unknown prompt style {self._prompt_style}")
+
+ # MMMU specific metadata.
+ metadata = {"question_type": sample["question_type"]}
+ if sample["question_type"] == "multiple-choice":
+ metadata["index2ans"] = sample["index2ans"]
+ metadata["all_choices"] = sample["all_choices"]
+
+ prompt = sample['final_input_prompt']
+
+ tile_count = torch.tensor(sample_num_tiles, dtype=torch.int)
+
+ return (
+ torch.stack(sample_imgs),
+ tile_count,
+ sample["id"],
+ prompt,
+ sample["answer"],
+ metadata,
+ )
+
+
+class VideoMMMEDataset(torch.utils.data.Dataset):
+    """Video MME evaluation dataset."""
+
+ def __init__(
+ self,
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ num_frames,
+ vision_model_type,
+ ):
+ ground_truth_original = json.load(open(gt_path))
+ ground_truth = []
+ for gt in ground_truth_original:
+ video_path = gt["url"]
+ video_path = video_path.replace("https://www.youtube.com/watch?v=", "")
+ video_path = video_path.replace("https://m.youtube.com/watch?v=", "")
+ video_path = os.path.join(input_image_path, video_path + ".mp4")
+ if not os.path.exists(video_path):
+ continue
+ gt["video_path"] = video_path
+ ground_truth.append(gt)
+
+ ground_truth = sorted(ground_truth, key=lambda gt: gt["video_path"])
+ print_rank_0(f"Found {len(ground_truth)} videos to process.")
+
+ if num_partitions > 0:
+ start_idx, end_idx = _get_partition_bounds(
+ len(ground_truth), num_samples_per_partition, num_partitions, partition_id
+ )
+ ground_truth = ground_truth[start_idx:end_idx]
+
+ self._ground_truth = ground_truth
+ self._img_h = img_h
+ self._img_w = img_w
+ self._use_tiling = use_tiling
+ self._max_num_tiles = max_num_tiles
+ self._use_thumbnail = use_thumbnail
+ self._num_frames = num_frames
+ self._vision_model_type = vision_model_type
+
+ def __len__(self):
+ return len(self._ground_truth)
+
+ def __getitem__(self, idx):
+ from torchvision.io import read_video
+
+ gt = self._ground_truth[idx]
+
+ video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec')
+ video = video.numpy()
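+        # Pick self._num_frames frame indices spaced evenly over the full video length.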
+ selected_frames = torch.linspace(0, video.shape[0] - 1, self._num_frames).long()
+ video_frames = video[selected_frames]
+ if self._num_frames == 1:
+ video_frames = video_frames[None]
+
+ imgs = list(
+ itertools.chain.from_iterable(
+ get_visual_transform(
+ img,
+ self._img_h,
+ self._img_w,
+ self._use_tiling,
+ self._max_num_tiles,
+ self._use_thumbnail,
+ augment=False,
+ vision_model_type=self._vision_model_type,
+ )
+ for img in video_frames
+ )
+ )
+
+ for question in gt["questions"]:
+ # Very hacky, but we essentially re-create gt holding only the
+            # question of interest. This is to make this generation script
+ # compatible with the Video MME evaluation script.
+ question_dict = {
+ "video_id": gt["video_id"],
+ "duration_category": gt["duration_category"],
+ "video_category": gt["video_category"],
+ "video_subcategory": gt["video_subcategory"],
+ "url": gt["url"],
+ "questions": [question],
+ }
+
+ num_tiles = torch.tensor([len(imgs)], dtype=torch.int)
+
+ answer = ""
+ metadata = ""
+
+ return (
+ torch.stack(imgs),
+ num_tiles,
+ question["question_id"],
+ question_dict,
+ answer,
+ metadata,
+ )
+
+
+class OCRBenchDataset(torch.utils.data.Dataset):
+ """OCRBench evaluation dataset."""
+
+ def __init__(
+ self,
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ ):
+ gt = json.load(open(gt_path, encoding='utf-8'))
+
+ if num_partitions > 0:
+ start_idx, end_idx = _get_partition_bounds(
+ len(gt), num_samples_per_partition, num_partitions, partition_id
+ )
+ gt = gt[start_idx:end_idx]
+
+ self._input_image_path = input_image_path
+ self._gt = gt
+ self._img_h = img_h
+ self._img_w = img_w
+ self._use_tiling = use_tiling
+ self._max_num_tiles = max_num_tiles
+ self._use_thumbnail = use_thumbnail
+ self._vision_model_type = vision_model_type
+
+ def __len__(self):
+ return len(self._gt)
+
+ def __getitem__(self, idx):
+ img_path = os.path.join(self._input_image_path, self._gt[idx]['image_path'])
+
+ img = Image.open(img_path)
+ imgs = get_visual_transform(
+ img,
+ self._img_h,
+ self._img_w,
+ self._use_tiling,
+ self._max_num_tiles,
+ self._use_thumbnail,
+ augment=False,
+ vision_model_type=self._vision_model_type,
+ )
+
+ tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+ metadata = {
+ "dataset_name": self._gt[idx]["dataset_name"],
+ "data_type": self._gt[idx]["type"],
+ }
+
+ return (
+ torch.stack(imgs),
+ tile_count,
+ idx,
+ self._gt[idx]["question"],
+ self._gt[idx]["answers"],
+ metadata,
+ )
+
+
+class MathVistaDataset(torch.utils.data.Dataset):
+ """MathVista evaluation dataset."""
+
+ def __init__(
+ self,
+ input_image_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ ):
+ import datasets
+
+ hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
+ assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
+
+ if os.path.exists(input_image_path):
+ dataset = datasets.load_dataset(
+ input_image_path, cache_dir=hf_datasets_cache, verification_mode="no_checks"
+ )
+ else:
+ dataset = datasets.load_dataset(
+ "AI4Math/MathVista", split="testmini", cache_dir=hf_datasets_cache
+ )
+
+ if num_partitions > 0:
+ start_idx, end_idx = _get_partition_bounds(
+ len(dataset), num_samples_per_partition, num_partitions, partition_id
+ )
+ dataset = dataset[start_idx:end_idx]
+
+ self._dataset = dataset
+ self._img_h = img_h
+ self._img_w = img_w
+ self._use_tiling = use_tiling
+ self._max_num_tiles = max_num_tiles
+ self._use_thumbnail = use_thumbnail
+ self._vision_model_type = vision_model_type
+
+ def __len__(self):
+ return len(self._dataset["pid"])
+
+ def __getitem__(self, idx):
+ # Already a PIL object.
+ img = self._dataset['decoded_image'][idx]
+
+ imgs = get_visual_transform(
+ img,
+ self._img_h,
+ self._img_w,
+ self._use_tiling,
+ self._max_num_tiles,
+ self._use_thumbnail,
+ augment=False,
+ vision_model_type=self._vision_model_type,
+ )
+
+ tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+ question_id = self._dataset["pid"][idx]
+ question = self._dataset["question"][idx]
+ question_type = self._dataset["question_type"][idx] # free_form or multi_choice
+ query = self._dataset["query"][idx]
+ choices = self._dataset["choices"][idx]
+ answer = self._dataset["answer"][idx]
+
+ if question_type == 'multi_choice':
+ start_chr = 'A'
+ choices_str = ''
+ index2ans = {}
+ all_choices = []
+ for choice in choices:
+ all_choices.append(start_chr)
+ index2ans[start_chr] = choice
+ choices_str += f"{start_chr}. {choice}\n"
+ start_chr = chr(ord(start_chr) + 1)
+
+ question = question + '\n' + choices_str
+ question = question + "Answer with the option's letter from the given choices directly."
+ answer = chr(ord('A') + choices.index(answer))
+ else:
+ question = query.replace("Hint: ", "")
+ index2ans = {}
+ all_choices = []
+
+ metadata = {
+ "question_type": question_type,
+ "index2ans": index2ans,
+ "all_choices": all_choices,
+ }
+
+ return torch.stack(imgs), tile_count, question_id, question, answer, metadata
+
+
+class AI2DDataset(torch.utils.data.Dataset):
+ """AI2D evaluation dataset."""
+
+ def __init__(
+ self,
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ no_mask,
+ vision_model_type,
+ ):
+ with open(gt_path, 'r') as f:
+ jsonl = list(f)
+
+ gt = [json.loads(json_str) for json_str in jsonl]
+
+ if num_partitions > 0:
+ start_idx, end_idx = _get_partition_bounds(
+ len(gt), num_samples_per_partition, num_partitions, partition_id
+ )
+ gt = gt[start_idx:end_idx]
+
+ self._gt = gt
+ self._input_image_path = input_image_path
+ self._img_h = img_h
+ self._img_w = img_w
+ self._use_tiling = use_tiling
+ self._max_num_tiles = max_num_tiles
+ self._use_thumbnail = use_thumbnail
+ self._no_mask = no_mask
+ self._vision_model_type = vision_model_type
+
+ def __len__(self):
+ return len(self._gt)
+
+ def __getitem__(self, idx):
+ img_path = os.path.join(self._input_image_path, self._gt[idx]['image'])
+ if self._no_mask:
+            img_path = img_path.replace("AI2D_TEST", "AI2D_TEST_NO_MASK_IMAGES")
+
+ img = Image.open(img_path)
+ imgs = get_visual_transform(
+ img,
+ self._img_h,
+ self._img_w,
+ self._use_tiling,
+ self._max_num_tiles,
+ self._use_thumbnail,
+ augment=False,
+ vision_model_type=self._vision_model_type,
+ )
+
+ tile_count = torch.tensor([len(imgs)], dtype=torch.int)
+
+ metadata = "" # Not used.
+
+ return (
+ torch.stack(imgs),
+ tile_count,
+ self._gt[idx]["question_id"],
+ self._gt[idx]["question"],
+ self._gt[idx]["answer"],
+ metadata,
+ )
+
+
+def get_evaluation_dataset(
+ task,
+ input_image_path,
+ gt_path,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ num_frames,
+ vision_model_type,
+):
+ """Get an evaluation dataset."""
+ if task == "TextVQA":
+ keys = {
+ "image_id": "image_id",
+ "sample_id": "question_id",
+ "question": "question",
+ "answer": "answers",
+ }
+
+ dataset = VQADataset(
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ keys,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ )
+ elif task == "VQAv2":
+ keys = {
+ "image_id": "image",
+ "sample_id": "question_id",
+ "question": "question",
+ "answer": "answer",
+ }
+
+ dataset = VQADataset(
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ keys,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ )
+ elif task == "ChartQA":
+ keys = {"image_id": "imgname", "question": "query", "answer": "label"}
+
+ dataset = VQADataset(
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ keys,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ )
+ elif task == "captioning":
+ dataset = CaptioningDataset(
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ )
+ elif task == 'MMMU':
+ # Note:
+ # - prompt_style="single_image" uses only one image like in the MMMU repo example.
+ # - prompt_style="multi_image" uses multiple input images.
+ # - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499
+ dataset = MMMUDataset(
+ input_image_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ prompt_style="single_image",
+ vision_model_type=vision_model_type,
+ )
+ elif task == "VideoMME":
+ dataset = VideoMMMEDataset(
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ num_frames,
+ vision_model_type,
+ )
+ elif task == "OCRBench":
+ dataset = OCRBenchDataset(
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ )
+ elif task == "MathVista":
+ dataset = MathVistaDataset(
+ input_image_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ vision_model_type,
+ )
+ elif task == "AI2D":
+ dataset = AI2DDataset(
+ input_image_path,
+ gt_path,
+ num_samples_per_partition,
+ num_partitions,
+ partition_id,
+ img_h,
+ img_w,
+ use_tiling,
+ max_num_tiles,
+ use_thumbnail,
+ no_mask=False,
+ vision_model_type=vision_model_type,
+ )
+ else:
+ raise NotImplementedError(f"unsupported task {task}")
+
+ return dataset
diff --git a/examples/multimodal/image_processing.py b/examples/multimodal/image_processing.py
index a4541576ae..ed9401c679 100644
--- a/examples/multimodal/image_processing.py
+++ b/examples/multimodal/image_processing.py
@@ -1,71 +1,36 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Except portions as noted which are Copyright (c) 2023 OpenGVLab and licensed under the MIT license found in LICENSE.
-import numpy as np
-import torch
-
-from PIL import Image, ImageDraw
from torchvision import transforms as T
-from torchvision.transforms import Compose, RandAugment, RandomResizedCrop, Resize, ToPILImage
-
-
-# Imagenet's mean and std.
-pixel_mean = [123.675, 116.28, 103.53]
-pixel_std = [58.395, 57.12, 57.375]
+from torchvision.transforms import Compose
+from torchvision.transforms.functional import InterpolationMode
-# Reshape for broadcasting.
-pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1)
-pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1)
+IMAGENET_PIXEL_MEAN = [0.485, 0.456, 0.406]
+IMAGENET_PIXEL_STD = [0.229, 0.224, 0.225]
+SIGLIP_PIXEL_MEAN = [0.5, 0.5, 0.5]
+SIGLIP_PIXEL_STD = [0.5, 0.5, 0.5]
+CLIP_PIXEL_MEAN = [0.48145466, 0.4578275, 0.40821073]
+CLIP_PIXEL_STD = [0.26862954, 0.26130258, 0.27577711]
-def convert_to_rgb(image):
- return image.convert("RGB")
-def _transform_train_aug(img_h, img_w):
- return Compose([
- ToPILImage(),
- RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0)),
- convert_to_rgb,
- RandAugment(2, 5, isPIL=True, augs=['Identity', 'AutoContrast', 'Brightness', 'Sharpness', 'Equalize',
- 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']),
- ])
+pixel_statistics = {
+ "clip": (CLIP_PIXEL_MEAN, CLIP_PIXEL_STD),
+ "siglip": (SIGLIP_PIXEL_MEAN, SIGLIP_PIXEL_STD),
+ "internvit": (IMAGENET_PIXEL_MEAN, IMAGENET_PIXEL_STD),
+}
-def _transform_test(img_h, img_w):
- return Compose([
- ToPILImage(),
- Resize((img_h, img_w)),
- convert_to_rgb,
- ])
+def get_visual_transform(img, img_h, img_w, use_tiling=False, max_num_tiles=1, use_thumbnail=False, augment=False, vision_model_type="clip"):
+ pixel_mean, pixel_std = pixel_statistics[vision_model_type]
-def standardize_image(img):
- """Standardize image pixel values."""
- return (torch.Tensor(np.array(img)).permute(2, 0, 1) - pixel_mean) / pixel_std
+ assert not augment, "Image augmentation not implemented."
+ transform = build_transform(img_h, pixel_mean, pixel_std, vision_model_type)
-
-def get_visual_transform(img, img_h, img_w, use_tiling=False, max_num_tiles=1, use_thumbnail=False, augment=False):
if use_tiling:
assert img_h == img_w, "dynamic tiling expects equal tile height and width"
imgs = dynamic_preprocess(img, min_num=1, max_num=max_num_tiles, image_size=img_h, use_thumbnail=use_thumbnail)
- imgs = [standardize_image(img.convert("RGB")) for img in imgs]
+ imgs = [transform(img) for img in imgs]
else:
- img = np.array(img)
- original_h, original_w = img.shape[0], img.shape[1]
- ratio = float(max(img_h, img_w)) / max(original_h, original_w)
- scaled_h, scaled_w = int(original_h * ratio + 0.5), int(original_w * ratio + 0.5)
-
- if augment:
- visual_transform = _transform_train_aug(scaled_h, scaled_w)
- else:
- visual_transform = _transform_test(scaled_h, scaled_w)
-
- img = visual_transform(img)
-
- # Standardize pixel values.
- img = standardize_image(img)
-
- # Pad to target image size.
- delta_h, delta_w = img_h - scaled_h, img_w - scaled_w
- img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h))
- imgs = [img]
+ imgs = [transform(img)]
return imgs
@@ -128,3 +93,26 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
+
+
+# Based on https://github.com/openai/CLIP/blob/dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1/clip/clip.py#L79
+# and https://github.com/OpenGVLab/InternVL/blob/aa521e6eb1df4cf153aa4118fcf13e673c055d46/internvl_chat/internvl/train/dataset.py#L276
+def build_transform(input_size, pixel_mean, pixel_std, vision_model_type):
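+    """Image preprocessing (RGB conversion, resize, to-tensor, normalization) for the given vision backbone."""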
+ if vision_model_type in ("siglip", "internvit"):
+ transform = T.Compose([
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+ T.ToTensor(),
+ T.Normalize(mean=pixel_mean, std=pixel_std)
+ ])
+ elif vision_model_type == "clip":
+ transform = Compose([
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+ T.ToTensor(),
+ T.Normalize(mean=pixel_mean, std=pixel_std),
+ ])
+ else:
+ raise NotImplementedError(f"image processing not defined for vision model {vision_model_type}")
+
+ return transform
diff --git a/examples/multimodal/layer_specs.py b/examples/multimodal/layer_specs.py
index b56e0b07e1..2e07dc808d 100644
--- a/examples/multimodal/layer_specs.py
+++ b/examples/multimodal/layer_specs.py
@@ -12,7 +12,7 @@
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
try:
- from megatron.core.transformer.custom_layers.transformer_engine import (
+ from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TELayerNormColumnParallelLinear,
@@ -28,16 +28,17 @@
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
+ from megatron.core.transformer.torch_norm import WrappedTorchNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
- from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm
+ from megatron.core.transformer.torch_norm import WrappedTorchNorm
- warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm')
- LNImpl = WrappedTorchLayerNorm
+ warnings.warn(f'Apex is not installed. Falling back to Torch Norm')
+ LNImpl = WrappedTorchNorm
def get_layer_spec(is_vit, normalization) -> ModuleSpec:
@@ -45,7 +46,21 @@ def get_layer_spec(is_vit, normalization) -> ModuleSpec:
if normalization == "LayerNorm":
norm = LNImpl
elif normalization == "RMSNorm":
- norm = TENorm
+ if HAVE_TE:
+ norm = TENorm
+ else:
+            version = torch.__version__.split('.')
+            version_geq_2_4 = (
+                int(version[0]) > 2
+                or (
+                    int(version[0]) == 2
+                    and int(version[1]) >= 4
+                )
+            )
+ assert version_geq_2_4, "Torch version >= 2.4.0 is required for RMSNorm"
+ if HAVE_APEX:
+ warnings.warn(f'Apex does not support RMSNorm. Falling back to Torch Norm')
+ norm = WrappedTorchNorm
else:
raise RuntimeError("unknown normalization", normalization)
diff --git a/examples/multimodal/model.py b/examples/multimodal/model.py
index b4bab73cfb..a28a428325 100644
--- a/examples/multimodal/model.py
+++ b/examples/multimodal/model.py
@@ -4,11 +4,11 @@
import torch
from config import get_language_model_config, get_vision_model_config, get_vision_projection_config
-from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec
+from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec, get_norm_mlp_module_spec_te
-from megatron.core.models.multimodal.llava_model import LLaVAModel
+from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN, LLaVAModel
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
-from megatron.training import get_args, print_rank_0
+from megatron.training import get_args, get_tokenizer, print_rank_0
from megatron.training.arguments import core_transformer_config_from_args
@@ -30,14 +30,22 @@ def model_provider(
model: A multimodal model.
"""
args = get_args()
+ assert args.ckpt_format == 'torch', "Only ckpt-format torch is supported for VLM training currently."
+    assert args.encoder_pipeline_model_parallel_size <= 1, "LLaVA does not support pp>1 for encoder on its own pipeline rank"
use_te = args.use_te
print_rank_0('building a multimodal model ...')
num_image_embeddings = get_num_image_embeddings(
- args.img_h, args.img_w, args.patch_dim, args.vision_model_type,
- args.disable_vision_class_token, 1
+ args.img_h,
+ args.img_w,
+ args.patch_dim,
+ args.vision_model_type,
+ args.disable_vision_class_token,
+ 1,
+ args.pixel_shuffle,
+ args.use_tile_tags,
)
old_seq_length = args.seq_length
args.seq_length = args.encoder_seq_length = num_image_embeddings
@@ -92,6 +100,9 @@ def model_provider(
vision_transformer_layer_spec = get_layer_spec(
is_vit=True, normalization=vision_config.normalization
)
+ elif vision_model_type == "internvit":
+ from nvlm.internvit import get_internvit_layer_spec
+ vision_transformer_layer_spec = get_internvit_layer_spec(use_te=use_te)
else:
raise RuntimeError("unsupported vision model type", vision_model_type)
@@ -100,21 +111,49 @@ def model_provider(
vision_projection_config, language_config.hidden_size
)
+ # --encoder-pipeline-model-parallel-size 1 will enable a separate pipeline stage for the vision model.
if args.encoder_pipeline_model_parallel_size > 0:
assert (
args.encoder_pipeline_model_parallel_size == 1
), "vision model and projection can only live on 1 pipeline stage."
- vision_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size
- vision_projection_config.pipeline_model_parallel_size = (
- args.encoder_pipeline_model_parallel_size
- )
+
if args.encoder_tensor_model_parallel_size > 0:
vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size
vision_projection_config.tensor_model_parallel_size = (
args.encoder_tensor_model_parallel_size
)
- vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
+ # Make sure vision model pipeline parallel size is not inherited from the language model pipeline parallel size.
+    # 0 is not a valid value for this config field, hence max(1, ...).
+ vision_config.pipeline_model_parallel_size = max(1, args.encoder_pipeline_model_parallel_size)
+ vision_projection_config.pipeline_model_parallel_size = vision_config.pipeline_model_parallel_size
+
+ # Make sure the vision model does not inherit first and last pipeline num layers from the language model.
+ vision_config.first_pipeline_num_layers = vision_config.last_pipeline_num_layers = None
+
+ if vision_projection_config.normalization:
+ vision_projection_layer_spec = get_norm_mlp_module_spec_te().submodules
+ else:
+ vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
+
+ # Toggle --recompute* for the vision and language model separately.
+ if args.recompute_vision:
+ if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
+ vision_config.recompute_num_layers = vision_config.num_layers
+ else:
+ vision_config.recompute_granularity = None
+ vision_config.recompute_method = None
+ vision_config.recompute_num_layers = None
+
+ vision_projection_config.recompute_granularity = None
+ vision_projection_config.recompute_method = None
+ vision_projection_config.recompute_num_layers = None
+
+
+ tokenizer = get_tokenizer()
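+    # Token id used to mark image positions in the text; LLaVA splices vision embeddings in at these positions.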
+ image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+
+ tile_tags = _get_tile_tags(args, tokenizer)
model = LLaVAModel(
language_transformer_config=language_config,
@@ -139,6 +178,10 @@ def model_provider(
img_w=args.img_w,
patch_dim=args.patch_dim,
language_rotary_base=args.rotary_base,
+ language_rope_scaling=args.use_rope_scaling,
+ image_token_index=image_token_index,
+ pixel_shuffle=args.pixel_shuffle,
+ tile_tags=tile_tags,
)
model.freeze(
@@ -148,3 +191,26 @@ def model_provider(
)
return model
+
+
+def _get_tile_tags(args, tokenizer):
+ """Tile tags are used in NVLM to surround image tiles with text tags."""
+ if not args.use_tile_tags:
+ return None
+
+    # We expect the tokenized length of the tags to be the same.
+    thumbnail_tag_text = "<tile_global_thumbnail>"
+    if args.tokenizer_prompt_format == "nvlm-yi-34b":
+        thumbnail_tag_text = "<tile_global>"
+
+    assert args.max_num_tiles <= 6, "Up to 6 tile tags used"
+    tile_tags_text = [f"<tile_{i}>" for i in range(1, args.max_num_tiles + 1)] + [thumbnail_tag_text]
+
+ start_idx = 0
+ if tokenizer._prompt_config.has_bos:
+ start_idx = 1
+
+ # Convert to tokens [num_tiles, tile_seq_len].
+ tile_tags = [tokenizer.tokenize(t)[start_idx:] for t in tile_tags_text]
+
+ return tile_tags
diff --git a/examples/multimodal/clip_converter.py b/examples/multimodal/model_converter/clip_converter.py
similarity index 100%
rename from examples/multimodal/clip_converter.py
rename to examples/multimodal/model_converter/clip_converter.py
diff --git a/examples/multimodal/model_converter/internvit_converter.py b/examples/multimodal/model_converter/internvit_converter.py
new file mode 100755
index 0000000000..48404c2084
--- /dev/null
+++ b/examples/multimodal/model_converter/internvit_converter.py
@@ -0,0 +1,162 @@
+import argparse
+import os
+
+import torch
+from transformers import AutoModel
+
+
+def convert(model_name, output_path, tensor_parallel_size, use_te):
+ """Convert InternViT HF checkpoint to mcore."""
+ hf_model = AutoModel.from_pretrained(
+ model_name,
+ trust_remote_code=True
+ )
+
+ hf_state_dict = hf_model.state_dict()
+ new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
+
+ hidden_size = 3200
+ num_heads = 25
+ dim = 128
+
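+    # Index permutation that maps the HF qkv layout [all Q heads, all K heads, all V heads]
+    # to Megatron's per-head interleaved layout [q0, k0, v0, q1, k1, v1, ...]; applied below as new_tensor[order].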
+ order = torch.ones(3 * hidden_size).long()
+
+ for j in range(num_heads):
+ for i in range(dim):
+ order[i + dim*3*j] = j*dim+i
+ order[dim + i + dim*3*j] = j*dim+i+num_heads*dim
+ order[dim*2 + i + dim*3*j] = j*dim+i+num_heads*dim*2
+
+ for name, tensor in hf_state_dict.items():
+ # Map parameter names to ones used in megatron.
+ new_name = ""
+ new_tensor = tensor
+
+ # This is used for chunking some tensors to target tensor parallel size.
+ chunk_dim = None
+
+ if "embeddings.class_embedding" in name:
+ new_name = "class_token"
+ elif "embeddings.patch_embedding.weight" in name:
+ new_name = "conv1.weight"
+ elif "embeddings.patch_embedding.bias" in name:
+ new_name = "conv1.bias"
+ elif "embeddings.position_embedding" in name:
+ new_name = "position_embeddings.weight"
+ new_tensor = new_tensor.squeeze(0)
+ elif "encoder.layers" in name:
+ layer_idx = name.split(".")[2]
+
+ base = f"decoder.layers.{layer_idx}"
+
+ head_dim = 128
+
+ if tensor_parallel_size == 1:
+ num_padded_heads = 25
+ elif tensor_parallel_size == 8:
+ # Note: 25 is not divisible by 8 and we don't currently support uneven heads split with tensor parallelism.
+ # So we pad with dummy all-zero heads. Please use a nice even number of attention heads in your model.
+ num_padded_heads = 32
+ else:
+ raise NotImplementedError("invalid tensor parallel size value:", tensor_parallel_size)
+
+ if "ls1" in name:
+ new_name = f"{base}.ls1"
+ elif "ls2" in name:
+ new_name = f"{base}.ls2"
+ elif "attn.qkv.weight" in name:
+ new_name = f"{base}.self_attention.linear_qkv.weight"
+ num_tensors = 3
+ padded_dim = head_dim * num_padded_heads * num_tensors
+ padded_tensor = torch.zeros((padded_dim, new_tensor.shape[-1]), dtype=new_tensor.dtype, device=new_tensor.device)
+ padded_tensor[:new_tensor.shape[0], :] = new_tensor[order]
+ new_tensor = padded_tensor
+ chunk_dim = 0
+ elif "attn.q_norm.weight" in name:
+ new_name = f"{base}.self_attention.q_layernorm.weight"
+ num_tensors = 1
+ padded_dim = head_dim * num_padded_heads * num_tensors
+ padded_tensor = torch.zeros(padded_dim, dtype=new_tensor.dtype, device=new_tensor.device)
+ padded_tensor[:new_tensor.shape[0]] = new_tensor
+ new_tensor = padded_tensor
+ chunk_dim = 0
+ elif "attn.k_norm.weight" in name:
+ new_name = f"{base}.self_attention.k_layernorm.weight"
+ num_tensors = 1
+ padded_dim = head_dim * num_padded_heads * num_tensors
+ padded_tensor = torch.zeros(padded_dim, dtype=new_tensor.dtype, device=new_tensor.device)
+ padded_tensor[:new_tensor.shape[0]] = new_tensor
+ new_tensor = padded_tensor
+ chunk_dim = 0
+ elif "attn.proj.weight" in name:
+ new_name = f"{base}.self_attention.linear_proj.weight"
+ num_tensors = 1
+ padded_dim = head_dim * num_padded_heads * num_tensors
+ padded_tensor = torch.zeros((new_tensor.shape[0], padded_dim), dtype=new_tensor.dtype, device=new_tensor.device)
+ padded_tensor[:, :new_tensor.shape[-1]] = new_tensor
+ new_tensor = padded_tensor
+ chunk_dim = 1
+ elif "attn.proj.bias" in name:
+ new_name = f"{base}.self_attention.linear_proj.bias"
+ elif "mlp.fc1.weight" in name:
+ new_name = f"{base}.mlp.linear_fc1.weight"
+ chunk_dim = 0
+ elif "mlp.fc1.bias" in name:
+ new_name = f"{base}.mlp.linear_fc1.bias"
+ chunk_dim = 0
+ elif "mlp.fc2.weight" in name:
+ new_name = f"{base}.mlp.linear_fc2.weight"
+ chunk_dim = 1
+ elif "mlp.fc2.bias" in name:
+ new_name = f"{base}.mlp.linear_fc2.bias"
+ elif "norm1" in name:
+ new_name = f"{base}.input_layernorm.weight"
+ elif "norm2" in name:
+ new_name = f"{base}.pre_mlp_layernorm.weight"
+ else:
+ raise RuntimeError("unexpected transformer layer name", name)
+ else:
+ raise RuntimeError("unexpected layer name", name)
+
+ assert new_name != "", f"unexpected layer name {name}"
+
+ # TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
+ extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
+ is_extra_state_layer = any([l in new_name for l in extra_state_layers])
+ if use_te and is_extra_state_layer:
+ layer = new_name.split(".")[-2]
+ if layer in extra_state_layers:
+ extra_state_name = (
+ new_name[: new_name.rfind(".") + 1] + "_extra_state"
+ ) # Replace the weight name.
+ for i in range(tensor_parallel_size):
+ new_state_dicts[i]["model"][extra_state_name] = None
+
+ if chunk_dim is None:
+ new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
+ else:
+ new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
+
+ for i in range(tensor_parallel_size):
+ new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
+
+ for i in range(tensor_parallel_size):
+ output_dir_tp = os.path.join(output_path, f"iter_0000001/mp_rank_0{i}")
+ os.makedirs(output_dir_tp, exist_ok=True)
+ output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
+ torch.save(new_state_dicts[i], output_path_tp)
+ print("saved file", output_path_tp)
+
+ print("done")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="InternVIT HuggingFace to Mcore converter")
+ parser.add_argument("--model-name", type=str, default="OpenGVLab/InternViT-6B-448px-V1-5", help="Model name in HuggingFace")
+ parser.add_argument("--output-dir", type=str, required=True, help="Output directory for the mcore model.")
+ parser.add_argument("--use-te", action="store_true", default=True)
+ parser.add_argument("--tensor-parallel-size", type=int, required=True)
+
+ args = parser.parse_args()
+
+ convert(args.model_name, args.output_dir, args.tensor_parallel_size, args.use_te)
diff --git a/examples/multimodal/model_converter/siglip_converter.py b/examples/multimodal/model_converter/siglip_converter.py
new file mode 100644
index 0000000000..666cda15eb
--- /dev/null
+++ b/examples/multimodal/model_converter/siglip_converter.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import argparse
+import os
+from transformers import PaliGemmaForConditionalGeneration
+import torch
+
+
+def convert(output_path, tensor_parallel_size, use_te):
+ device = "cuda"
+
+ model_id = "google/paligemma-3b-pt-448"
+ model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
+
+ model = model.to(device)
+
+ print(model.config)
+ for name, tensor in model.state_dict().items():
+ if "vision_model" not in name:
+ continue
+ shape_str = "(" + ", ".join([str(x) for x in tensor.shape]) + ")"
+ print(f"{name:<75} {shape_str:>20}")
+
+ state_dict = model.state_dict()
+ new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
+
+ def add_chunck_tensor(new_tensor, new_name, chunk_dim=None):
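+        """Store new_tensor as new_name in every tensor-parallel rank's state dict, chunked along chunk_dim if given, otherwise replicated."""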
+ if chunk_dim is None:
+ new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
+ else:
+ new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
+
+ for i in range(tensor_parallel_size):
+ # chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage.
+ new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
+
+ # TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
+ extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
+ is_extra_state_layer = any([l in new_name for l in extra_state_layers])
+ if use_te and is_extra_state_layer:
+ layer = new_name.split(".")[-2]
+ if layer in extra_state_layers:
+ extra_state_name = (
+ new_name[: new_name.rfind(".") + 1] + "_extra_state"
+ ) # Replace the weight name.
+ new_state_dicts[i]["model"][extra_state_name] = None
+
+ for name, tensor in state_dict.items():
+ if tensor.dtype == torch.float16:
+ state_dict[name] = tensor.to(torch.float32)
+
+ add_chunck_tensor(
+ state_dict["vision_tower.vision_model.embeddings.position_embedding.weight"],
+ "position_embeddings.weight")
+ add_chunck_tensor(
+ state_dict["vision_tower.vision_model.embeddings.patch_embedding.weight"],
+ "conv1.weight")
+ add_chunck_tensor(
+ state_dict["vision_tower.vision_model.embeddings.patch_embedding.bias"],
+ "conv1.bias")
+
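+    # PaliGemma's SigLIP vision tower: hidden size 1152 = 16 heads x 72 dims, 27 encoder layers.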
+ head_dim = 72
+ num_head = 16
+ for layer_idx in range(27):
+ origin_base = f"vision_tower.vision_model.encoder.layers.{layer_idx}"
+ target_base = f"decoder.layers.{layer_idx}"
+
+ for param_type in ["weight", "bias"]:
+ # QKV
+ q_proj_params = state_dict[f"{origin_base}.self_attn.q_proj.{param_type}"]
+ k_proj_params = state_dict[f"{origin_base}.self_attn.k_proj.{param_type}"]
+ v_proj_params = state_dict[f"{origin_base}.self_attn.v_proj.{param_type}"]
+            # Do some tensor manipulation because Megatron expects a single
+            # QKV projection tensor in the order
+            # [(Q1, K1, V1), (Q2, K2, V2), ...] where Qi is the query of the
+            # i-th head with dimension head_dim.
+ new_tensor = torch.concatenate([
+ q_proj_params.view(num_head, head_dim, -1),
+ k_proj_params.view(num_head, head_dim, -1),
+ v_proj_params.view(num_head, head_dim, -1)], axis=1).view(
+ 3*head_dim*num_head, -1)
+ if param_type == "bias":
+ new_tensor = new_tensor[:, 0]
+ new_name = f"{target_base}.self_attention.linear_qkv.{param_type}"
+ add_chunck_tensor(new_tensor, new_name, chunk_dim=0)
+ # linear_proj
+ add_chunck_tensor(
+ state_dict[f"{origin_base}.self_attn.out_proj.{param_type}"],
+ f"{target_base}.self_attention.linear_proj.{param_type}",
+ chunk_dim=1 if param_type == "weight" else None)
+ # layer_norm
+ new_name = f"{target_base}.input_layernorm.{param_type}"
+ if use_te:
+ new_name = f"{target_base}.self_attention.linear_qkv.layer_norm_{param_type}"
+ add_chunck_tensor(
+ state_dict[f"{origin_base}.layer_norm1.{param_type}"],
+ new_name)
+ # FC 1
+ add_chunck_tensor(
+ state_dict[f"{origin_base}.mlp.fc1.{param_type}"],
+ f"{target_base}.mlp.linear_fc1.{param_type}",
+ chunk_dim=0)
+ # FC 2
+ add_chunck_tensor(
+ state_dict[f"{origin_base}.mlp.fc2.{param_type}"],
+ f"{target_base}.mlp.linear_fc2.{param_type}",
+ chunk_dim=1 if param_type=="weight" else None)
+ # layer_norm
+ new_name = f"{target_base}.pre_mlp_layernorm.{param_type}"
+ if use_te:
+ new_name = f"{target_base}.mlp.linear_fc1.layer_norm_{param_type}"
+ add_chunck_tensor(
+ state_dict[f"{origin_base}.layer_norm2.{param_type}"],
+ new_name)
+
+ add_chunck_tensor(
+ state_dict["vision_tower.vision_model.post_layernorm.weight"],
+ "ln_post.weight")
+ add_chunck_tensor(
+ state_dict["vision_tower.vision_model.post_layernorm.bias"],
+ "ln_post.bias")
+
+ for i in range(tensor_parallel_size):
+ output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_0{i}")
+ os.makedirs(output_dir_tp)
+ output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
+ torch.save(new_state_dicts[i], output_path_tp)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="""
+Convert SigLIP weights to megatron format.
+
+
+Example usage:
+python siglip_converter.py --tensor-parallel-size 4 --output google_paligemma_3b_pt_44_mcore_tp_4 --use-te
+
+examples/multimodal/combine_mistral_clip.sh Mistral-7B-Instruct-v0.3-mcore-tp4 google_paligemma_3b_pt_44_mcore_tp_4 mistral_7b_instruct_v0p3_google_paligemma_3b_pt_44_mcore_tp_4
+""",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ )
+ parser.add_argument(
+ "--output", type=str, required=True, help="output directory for megatron state dict file(s)"
+ )
+ parser.add_argument(
+ "--tensor-parallel-size", type=int, default=1, help="model tensor parallel size"
+ )
+ parser.add_argument("--use-te", action="store_true", help="Use Transformer Engine")
+
+ args = parser.parse_args()
+
+ convert(args.output, args.tensor_parallel_size, args.use_te)
+
+ print("done.")
diff --git a/examples/multimodal/model_converter/vision_model_tester.py b/examples/multimodal/model_converter/vision_model_tester.py
new file mode 100644
index 0000000000..ef36dd5f9e
--- /dev/null
+++ b/examples/multimodal/model_converter/vision_model_tester.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import argparse
+import os
+import sys
+
+# Add megatron and the multimodal example to the path.
+sys.path.append(
+ os.path.abspath(
+ os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir)
+ )
+)
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
+
+import torch
+from transformers import AutoModel
+
+from examples.multimodal.model import model_provider
+from examples.multimodal.multimodal_args import add_multimodal_extra_args
+from megatron.training import get_model
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.initialize import initialize_megatron
+
+
+def run_mcore_vision(model_path):
+ """Run mcore vision model."""
+ os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+
+ # Megatron has some mandatory flags.
+ sys.argv = [
+ "ignore_me.py",
+ "--micro-batch-size=1",
+ "--num-layers=2",
+ "--vision-model-type=internvit",
+ "--language-model-type=mistral_7b",
+ "--tokenizer-prompt-format=mistral",
+ "--tokenizer-type=MultimodalTokenizer",
+ "--tokenizer-model=mistralai/Mistral-7B-Instruct-v0.3",
+ "--vocab-size=1024",
+ "--hidden-size=64",
+ "--num-attention-heads=8",
+ "--seq-length=1024",
+ "--decoder-seq-length=2048",
+ "--max-position-embeddings=2048",
+ "--bf16",
+ "--img-h=448",
+ "--img-w=448",
+ "--patch-dim=14",
+ "--tensor-model-parallel-size=8",
+ "--use-te",
+ f"--pretrained-checkpoint={model_path}",
+ ]
+
+ initialize_megatron(extra_args_provider=add_multimodal_extra_args)
+
+ def wrapped_model_provider(pre_process, post_process):
+ return model_provider(pre_process, post_process, parallel_output=False)
+
+ # Set up model and load checkpoint.
+ model = get_model(wrapped_model_provider, wrap_with_ddp=False)
+
+ vision_model = model[0].module.vision_model
+
+ load_checkpoint([vision_model], None, None)
+
+ vision_model.eval()
+
+ images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
+
+ output = vision_model(images)
+
+ return output
+
+
+def run_hf_vision(model_name):
+ """Run HF vision model."""
+ model = (
+ AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True)
+ .cuda()
+ .eval()
+ )
+
+ images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
+
+ outputs = model(images, return_dict=True)
+
+ return outputs
+
+
+def main(mcore_model, hf_model):
+ """Compare vision model outputs between mcore and HF given the same fixed input."""
+ mcore = run_mcore_vision(mcore_model)
+
+ if torch.distributed.get_rank() == 0:
+ hf = run_hf_vision(hf_model)
+ hf = hf["last_hidden_state"]
+
+ # Compare logits. Due to different attention implementations and other details,
+ # there will be numerical differences.
+ diff = (mcore - hf).abs()
+ mean_diff = diff.mean().item()
+ max_diff = diff.max().item()
+ print(f"mean diff {mean_diff}, max diff {max_diff}")
+ assert mean_diff < 0.1, "mean output difference is greater than expected"
+ assert max_diff < 50, "max output difference is greater than expected"
+
+ print("lgtm")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Check mcore vision model output vs. HF numerically.",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ )
+ parser.add_argument(
+ "--mcore-model", type=str, required=True, help="directory for mcore model weights"
+ )
+ parser.add_argument("--hf-model", type=str, required=True, help="Model name in HF")
+
+ args = parser.parse_args()
+
+ main(args.mcore_model, args.hf_model)
diff --git a/examples/multimodal/multimodal_args.py b/examples/multimodal/multimodal_args.py
index a7cb4235e3..eb56118e71 100644
--- a/examples/multimodal/multimodal_args.py
+++ b/examples/multimodal/multimodal_args.py
@@ -1,4 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
def add_multimodal_extra_args(parser):
@@ -39,5 +40,40 @@ def add_multimodal_extra_args(parser):
group.add_argument(
"--online-evaluation-config", type=str, help="Config file for online evaluation."
)
+ group.add_argument(
+ "--special-tokens",
+ nargs="*",
+ default=[IMAGE_TOKEN],
+ help="Special tokens used in the multimodal model",
+ )
+ group.add_argument(
+ "--tokenizer-prompt-format",
+ type=str,
+ choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
+ required=True,
+ help="Prompt format to use with the tokenizer.",
+ )
+ group.add_argument("--pixel-shuffle", action="store_true", default=False)
+ group.add_argument(
+ "--image-tag-type",
+ type=str,
+ choices=["nvlm", "internvl", ""],
+ default="", # Default: Image tag not used.
+ help="Surround image tokens with tags.",
+ )
+ group.add_argument("--use-tile-tags", action="store_true", default=False, help="Use tile tags")
+ group.add_argument(
+ "--packing-buffer-size",
+ type=int,
+ default=None, # Packing is disabled by default.
+ help="Enable sample packing by setting the buffer size to > 0",
+ )
+ group.add_argument(
+ "--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
+ )
+ group.add_argument(
+ "--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
+ )
+
return parser
diff --git a/examples/multimodal/nvlm/README.md b/examples/multimodal/nvlm/README.md
new file mode 100644
index 0000000000..7eddbb7efa
--- /dev/null
+++ b/examples/multimodal/nvlm/README.md
@@ -0,0 +1,100 @@
+NVLM
+====
+
+Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
+
+*NOTE: VLMs in Megatron are under active development and are expected to change.*
+
+# Setup
+
+## Docker image
+
+Please use `examples/multimodal/Dockerfile`.
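+
+For example, the container can be built along these lines (the image tag is just an illustration):
+```
+docker build -f examples/multimodal/Dockerfile -t megatron-multimodal .
+```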
+
+## Dataset preparation
+
+Please refer to Tables 4 and 6 in the [NVLM paper](https://arxiv.org/pdf/2409.11402) for the full list of pretrain and SFT datasets.
+Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html for preparing datasets in the Megatron Energon format.
+
+## Model conversion
+
+### Vision model
+
+NVLM 1.0 models use [OpenGVLab/InternViT-6B-448px-V1-5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V1-5) from HuggingFace.
+Please download it and run the following command to convert it to Megatron format.
+```
+python examples/multimodal/model_converter/internvit_converter.py --output-dir <mcore output directory> --use-te --tensor-parallel-size 8
+```
+
+### 34B Language model
+
+NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B) from HuggingFace.
+Please download it and run the following command to convert it to Megatron format.
+```
+python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
+    --load-dir <HF model directory> --save-dir <Megatron model output directory>