Commit
rebased to main
Signed-off-by: dorotat-nv <dorotat@nvidia.com>
dorotat-nv committed Jan 6, 2025
2 parents 125d17e + f12a475 commit d36a16d
Showing 58 changed files with 2,795 additions and 1,108 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -24,7 +24,7 @@
"NUMBA_CACHE_DIR": "/tmp/"
},
"postCreateCommand": "./.devcontainer/postCreateCommand.sh",
"remoteUser": "bionemo",
"remoteUser": "ubuntu",
"customizations": {
"vscode": {
"extensions": [
1 change: 0 additions & 1 deletion .gitignore
@@ -2,7 +2,6 @@
docs/site/
*.nemo
protein/
singlecell/
results/

# Local configs
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM
Submodule Megatron-LM updated 87 files
+1 −1 .gitlab/stages/00.pre.yml
+1 −0 examples/gpt3/gpt_config.yaml
+55 −50 examples/inference/README.md
+5 −5 examples/inference/gpt/gpt_batch_inference.py
+3 −3 examples/inference/t5/simple_t5_batch_inference.py
+2 −2 examples/multimodal/README.md
+137 −66 examples/multimodal/dataset_helpers.py
+0 −0 examples/multimodal/evaluation/evaluate_ai2d.py
+0 −0 examples/multimodal/evaluation/evaluate_chartqa.py
+0 −0 examples/multimodal/evaluation/evaluate_coco.py
+0 −0 examples/multimodal/evaluation/evaluate_mathvista.py
+6 −0 examples/multimodal/evaluation/evaluate_mmmu.py
+0 −0 examples/multimodal/evaluation/evaluate_ocrbench.py
+0 −0 examples/multimodal/evaluation/evaluate_textvqa.py
+0 −0 examples/multimodal/evaluation/evaluate_vqav2.py
+0 −0 examples/multimodal/evaluation/evaluation_datasets.py
+9 −2 examples/multimodal/nvlm/README.md
+1 −1 examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh
+1 −1 examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh
+1 −1 examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh
+2 −2 examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh
+1 −1 examples/multimodal/nvlm/sft_34b_internvit.sh
+1 −1 examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh
+1 −6 examples/multimodal/pretrain_mistral_clip.sh
+1 −1 examples/multimodal/run_text_generation.py
+1 −6 examples/multimodal/sft_mistral_clip.sh
+4 −13 examples/multimodal/text_generation_mistral_clip.sh
+2 −2 examples/multimodal/train.py
+2 −1 megatron/core/dist_checkpointing/mapping.py
+0 −2 megatron/core/dist_checkpointing/serialization.py
+13 −14 megatron/core/dist_checkpointing/validation.py
+21 −6 megatron/core/distributed/distributed_data_parallel.py
+29 −18 megatron/core/extensions/transformer_engine.py
+4 −29 megatron/core/inference/common_inference_params.py
+15 −8 megatron/core/inference/engines/mcore_engine.py
+2 −2 megatron/core/inference/inference_request.py
+35 −0 megatron/core/inference/sampling_params.py
+3 −3 megatron/core/inference/scheduler.py
+4 −4 megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py
+3 −398 megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py
+400 −0 megatron/core/inference/text_generation_controllers/text_generation_controller.py
+48 −24 megatron/core/models/bert/bert_layer_specs.py
+16 −13 megatron/core/models/common/embeddings/rope_utils.py
+5 −2 megatron/core/models/multimodal/llava_model.py
+49 −24 megatron/core/optimizer/__init__.py
+21 −9 megatron/core/optimizer/clip_grads.py
+193 −102 megatron/core/optimizer/distrib_optimizer.py
+143 −86 megatron/core/optimizer/optimizer.py
+65 −0 megatron/core/optimizer/optimizer_config.py
+10 −0 megatron/core/pipeline_parallel/schedules.py
+80 −76 megatron/core/rerun_state_machine.py
+670 −212 megatron/core/transformer/cuda_graphs.py
+4 −1 megatron/core/transformer/moe/README.md
+115 −4 megatron/core/transformer/moe/moe_utils.py
+55 −26 megatron/core/transformer/moe/router.py
+4 −1 megatron/core/transformer/transformer_block.py
+32 −6 megatron/core/transformer/transformer_config.py
+4 −5 megatron/inference/text_generation/forward_step.py
+39 −8 megatron/training/arguments.py
+27 −12 megatron/training/checkpointing.py
+37 −13 megatron/training/training.py
+49 −11 megatron/training/utils.py
+4 −10 pretrain_vlm.py
+1 −0 ...sts/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml
+0 −1 ...s/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml
+0 −1 tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml
+1 −0 tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml
+1 −0 ...imodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G/golden_values_dev.json
+1 −0 ...imodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G/golden_values_lts.json
+57 −0 .../multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G/model_config.yaml
+1 −0 ...ava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G/golden_values_dev.json
+1 −0 ...ava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G/golden_values_lts.json
+58 −0 ...al-llava/multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G/model_config.yaml
+1 −0 ...ional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml
+1 −0 tests/functional_tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml
+1 −0 ..._tests/test_cases/t5/t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml
+2 −0 tests/test_utils/recipes/multimodal-llava.yaml
+68 −0 tests/unit_tests/dist_checkpointing/test_flattened_resharding.py
+33 −0 tests/unit_tests/dist_checkpointing/test_serialization.py
+6 −8 tests/unit_tests/inference/engines/test_mcore_engine.py
+3 −3 tests/unit_tests/inference/test_common_inference_params.py
+2 −2 tests/unit_tests/inference/test_scheduler.py
+2 −2 tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py
+13 −13 tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py
+47 −0 tests/unit_tests/test_optimizer.py
+44 −0 tests/unit_tests/transformer/moe/test_aux_loss.py
+59 −0 tests/unit_tests/transformer/moe/test_routers.py
2 changes: 1 addition & 1 deletion 3rdparty/NeMo
Submodule NeMo updated 372 files
2 changes: 1 addition & 1 deletion CODEOWNERS
@@ -93,4 +93,4 @@ sub-packages/bionemo-geneformer @jstjohn @malcolmgreaves @skothenhill-nv

sub-packages/bionemo-scdl @jstjohn @malcolmgreaves @polinabinder1 @skothenhill-nv

sub-packages/bionemo-noodles @skothenhill-nv @malcolmgreaves @jstjohn @edawson
sub-packages/bionemo-noodles @skothenhill-nv @malcolmgreaves @jstjohn @edawson @cspades
90 changes: 48 additions & 42 deletions Dockerfile
@@ -1,12 +1,18 @@
# Base image with apex and transformer engine, but without NeMo or Megatron-LM.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
# Note that the core NeMo docker container is defined here:
# https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/llm_train/Dockerfile.train
# with settings that get defined/injected from this config:
# https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/.gitlab-ci.yml
# We should keep versions in our container up to date to ensure that we get the latest tested perf improvements and
# training loss curves from NeMo.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.12-py3

FROM rust:1.82.0 as rust-env
FROM rust:1.82.0 AS rust-env

RUN rustup set profile minimal && \
rustup install 1.82.0 && \
rustup target add x86_64-unknown-linux-gnu && \
rustup default 1.82.0
rustup install 1.82.0 && \
rustup target add x86_64-unknown-linux-gnu && \
rustup default 1.82.0

FROM ${BASE_IMAGE} AS bionemo2-base

@@ -25,7 +31,7 @@ RUN git clone https://github.com/NVIDIA/apex.git && \
--config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam --group_norm"

# Transformer Engine pre-1.7.0. 1.7 standardizes the meaning of bits in the attention mask to match
ARG TE_COMMIT=c27ee60ec746210bcea4ec33958dbbff06706506
ARG TE_COMMIT=2215fa5c7557b66034068816020f9f611019e457
RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin ${TE_COMMIT} && \
@@ -49,11 +55,11 @@ RUN apt-get install -y gnupg
# Check the nemo dependency for causal conv1d and make sure this checkout
# tag matches. If not, update the tag in the following line.
RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.0.post2
git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1

# Mamba dependency installation
RUN pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/state-spaces/mamba.git@v2.0.3
git+https://github.com/state-spaces/mamba.git@v2.2.2

RUN pip install hatchling # needed to install nemo-run
ARG NEMU_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2
@@ -67,30 +73,23 @@ RUN rm -rf /build

# Addressing Security Scan Vulnerabilities
RUN rm -rf /opt/pytorch/pytorch/third_party/onnx
RUN apt-get update && \
apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \
rm -rf /var/lib/apt/lists/*
RUN apt purge -y libslurm37 libpmi2-0 && \
apt autoremove -y
RUN source /usr/local/nvm/nvm.sh && \
NODE_VER=$(nvm current) && \
nvm deactivate && \
nvm uninstall $NODE_VER && \
sed -i "/NVM/d" /root/.bashrc && \
sed -i "/nvm.sh/d" /etc/bash.bashrc


# Use UV to install python packages from the workspace. This just installs packages into the system's python
# environment, and does not use the current uv.lock file.
# environment, and does not use the current uv.lock file. Note that with python 3.12, we now need to set
# UV_BREAK_SYSTEM_PACKAGES, since the pytorch base image has made the decision not to use a virtual environment and UV
# does not respect the PIP_BREAK_SYSTEM_PACKAGES environment variable set in the base dockerfile.
COPY --from=ghcr.io/astral-sh/uv:0.4.25 /uv /usr/local/bin/uv
ENV UV_LINK_MODE=copy \
UV_COMPILE_BYTECODE=1 \
UV_PYTHON_DOWNLOADS=never \
UV_SYSTEM_PYTHON=true
UV_SYSTEM_PYTHON=true \
UV_NO_CACHE=1 \
UV_BREAK_SYSTEM_PACKAGES=1

# Install the bionemo-geomtric requirements ahead of copying over the rest of the repo, so that we can cache their
# Install the bionemo-geometric requirements ahead of copying over the rest of the repo, so that we can cache their
# installation. These involve building some torch extensions, so they can take a while to install.
RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \
--mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked \
uv pip install --no-build-isolation -r /requirements-pyg.txt

WORKDIR /workspace/bionemo2
@@ -107,19 +106,32 @@ ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"

# Note, we need to mount the .git folder here so that setuptools-scm is able to fetch git tag for version.
# Includes a hack to install tensorstore 0.1.45, which doesn't distribute a pypi wheel for python 3.12, and the metadata
# in the source distribution doesn't match the expected pypi version.
RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
<<EOF
set -eo pipefail
uv pip install maturin --no-build-isolation && uv pip install --no-build-isolation \
uv pip install maturin --no-build-isolation

pip install --use-deprecated=legacy-resolver --no-build-isolation \
tensorstore==0.1.45
sed -i 's/^Version: 0\.0\.0$/Version: 0.1.45/' \
/usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info/METADATA
mv /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info \
/usr/local/lib/python3.12/dist-packages/tensorstore-0.1.45.dist-info

uv pip install --no-build-isolation \
./3rdparty/* \
./sub-packages/bionemo-* \
-r /requirements-cve.txt \
-r /requirements-test.txt

rm -rf ./3rdparty
rm -rf /tmp/*
rm -rf ./sub-packages/bionemo-noodles/target
rm -rf /root/.cache/*
EOF

# In the devcontainer image, we just copy over the finished `dist-packages` folder from the build image back into the
@@ -138,35 +150,32 @@ apt-get install -qyy \
rm -rf /tmp/* /var/tmp/*
EOF

# Create a non-root user to use inside a devcontainer.
ARG USERNAME=bionemo
ARG USER_UID=1000
ARG USER_GID=$USER_UID
RUN groupadd --gid $USER_GID $USERNAME \
&& useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
&& echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
# Use a non-root user to use inside a devcontainer (with ubuntu 23 and later, we can use the default ubuntu user).
ARG USERNAME=ubuntu
RUN echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
&& chmod 0440 /etc/sudoers.d/$USERNAME

# Here we delete the dist-packages directory from the pytorch base image, and copy over the dist-packages directory from
# the build image. This ensures we have all the necessary dependencies installed (megatron, nemo, etc.).
RUN <<EOF
set -eo pipefail
rm -rf /usr/local/lib/python3.10/dist-packages
mkdir -p /usr/local/lib/python3.10/dist-packages
chmod 777 /usr/local/lib/python3.10/dist-packages
rm -rf /usr/local/lib/python3.12/dist-packages
mkdir -p /usr/local/lib/python3.12/dist-packages
chmod 777 /usr/local/lib/python3.12/dist-packages
chmod 777 /usr/local/bin
EOF

USER $USERNAME

COPY --from=bionemo2-base --chown=$USERNAME:$USERNAME --chmod=777 \
/usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
/usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages

COPY --from=ghcr.io/astral-sh/uv:0.4.25 /uv /usr/local/bin/uv
ENV UV_LINK_MODE=copy \
UV_COMPILE_BYTECODE=0 \
UV_PYTHON_DOWNLOADS=never \
UV_SYSTEM_PYTHON=true
UV_SYSTEM_PYTHON=true \
UV_BREAK_SYSTEM_PACKAGES=1

# Bring in the rust toolchain, as maturin is a dependency listed in requirements-dev
COPY --from=rust-env /usr/local/cargo /usr/local/cargo
@@ -184,7 +193,7 @@ EOF

RUN <<EOF
set -eo pipefail
rm -rf /usr/local/lib/python3.10/dist-packages/bionemo*
rm -rf /usr/local/lib/python3.12/dist-packages/bionemo*
pip uninstall -y nemo_toolkit megatron_core
EOF

@@ -208,9 +217,6 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"

RUN uv pip uninstall maturin
RUN uv pip install maturin --no-build-isolation

RUN <<EOF
set -eo pipefail
find . -name __pycache__ -type d -print | xargs rm -rf
@@ -220,8 +226,8 @@ for sub in ./3rdparty/* ./sub-packages/bionemo-*; do
done
EOF

# Since the entire repo is owned by root, swithcing username for development breaks things.
ARG USERNAME=bionemo
# Since the entire repo is owned by root, switching username for development breaks things.
ARG USERNAME=ubuntu
RUN chown $USERNAME:$USERNAME -R /workspace/bionemo2/
USER $USERNAME

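The tensorstore workaround above (forcing a source build, then patching the dist-info metadata from 0.0.0 to 0.1.45) is easy to get subtly wrong, so a post-build sanity check is worthwhile. A minimal sketch, assuming the built image is tagged `bionemo2` (the tag is illustrative, not from this commit):

```bash
# Expect "Version: 0.1.45" from the patched metadata, not 0.0.0.
docker run --rm bionemo2 pip show tensorstore | grep "^Version"
# Confirm the package actually imports under the image's Python 3.12.
docker run --rm bionemo2 python -c "import tensorstore"
```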
1 change: 1 addition & 0 deletions Dockerfile.arm
@@ -312,6 +312,7 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup


# RUN rm -rf /usr/local/cargo /usr/local/rustup
RUN rm -rf /root/.cache/bazel
RUN chmod 777 -R /workspace/bionemo2/

# Transformer engine attention defaults
16 changes: 8 additions & 8 deletions README.md
@@ -279,10 +279,10 @@ type, and then pass in the config type to the training recipe.
Similar to ESM-2, you can download the dataset and checkpoint through our utility function.

```bash
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20241203 --source $MY_DATA_SOURCE); \
GENEFORMER_10M_CKPT=$(download_bionemo_data geneformer/10M_240530:2.0 --source $MY_DATA_SOURCE); \
train_geneformer \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small_processed_scdl \
--result-dir ./results \
--restore-from-checkpoint-path ${GENEFORMER_10M_CKPT} \
--experiment-name test_experiment \
@@ -305,9 +305,9 @@ copy the `sub-projects/bionemo-geneformer/geneformer/scripts/train_geneformer.py`
Simple fine-tuning example (**NOTE**: please change `--restore-from-checkpoint-path` to be the checkpoint directory path that was output last
by the previous train run)
```bash
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20241203 --source $MY_DATA_SOURCE); \
train_geneformer \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small_processed_scdl \
--result-dir ./results \
--experiment-name test_finettune_experiment \
--num-gpus 1 \
@@ -331,11 +331,11 @@ customizations for your task.


```bash
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20241203 --source $MY_DATA_SOURCE); \
bionemo-geneformer-recipe \
--recipe geneformer_10m_pretrain_recipe \
--dest my_config.yaml \
--data-path ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
--recipe 10m-pretrain \
--dest my_config.json \
--data-path ${TEST_DATA_DIR}/cellxgene_2023-12-15_small_processed_scdl \
--result-dir ./results
```
> ⚠️ **IMPORTANT:** Inspect and edit the contents of the generated my_config.json as you see fit
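A quick way to review the generated config before training is to pretty-print it (a sketch; the config's exact keys are not shown in this diff):

```bash
# json.tool ships with CPython; page through the config and edit as needed.
python -m json.tool my_config.json | less
```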
10 changes: 8 additions & 2 deletions ci/scripts/run_pytest.sh
@@ -19,6 +19,8 @@ set -xueo pipefail
export PYTHONDONTWRITEBYTECODE=1
# NOTE: if a non-nvidia user wants to run the test suite, just run `export BIONEMO_DATA_SOURCE=ngc` prior to this call.
export BIONEMO_DATA_SOURCE="${BIONEMO_DATA_SOURCE:-pbss}"
# flexible GPU memory management, reducing the risk of fragmentation-related CUDA OOM
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
source "$(dirname "$0")/utils.sh"

if ! set_bionemo_home; then
@@ -27,12 +29,16 @@ fi

python -m coverage erase

error=false
for dir in docs/ ./sub-packages/bionemo-*/; do
echo "Running pytest in $dir"
python -m coverage run --parallel-mode --source=bionemo \
-m pytest -v --nbval-lax --durations=0 --durations-min=60.0 --ignore-glob='*docs/docs/user-guide/examples/bionemo-esm2/mutant-design.ipynb' "$dir"

-m pytest -v --nbval-lax --durations=0 --durations-min=60.0 "$dir" || error=true
done

python -m coverage combine
python -m coverage report --show-missing

if [ "$error" = true ]; then
exit 1
fi
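With the new `error` flag, the loop now runs every sub-package's tests before reporting failure instead of aborting on the first failing directory. A typical invocation for an external user might look like this (a sketch; it assumes the repository root as the working directory):

```bash
# Use the public NGC data source instead of the internal default (pbss),
# then run the whole suite; a non-zero exit means at least one directory failed.
export BIONEMO_DATA_SOURCE=ngc
./ci/scripts/run_pytest.sh
```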
2 changes: 2 additions & 0 deletions docs/docs/user-guide/appendix/releasenotes-fw.md
@@ -21,6 +21,8 @@
* Moved inference script to a new executable `infer_esm2`, and deprecated the inference example in the fine-tuning tutorial.
* Added new Jupyter notebook tutorials for inference and zero-shot protein design. These notebooks can be deployed on the cloud resources as a [brev.dev](https://www.brev.dev/) launchable.

### Known Issues:
* Loading a checkpoint for Geneformer inference on H100 has a known regression in accuracy. Work is in progress to resolve by next release.

## BioNeMo Framework v2.1


At least one changed file's diff is omitted here (large diffs are not rendered by default).

10 changes: 5 additions & 5 deletions docs/docs/user-guide/examples/bionemo-esm2/finetune.md
@@ -230,22 +230,22 @@ We download a CSV example dataset of artificial sequences for this inference example.
mkdir -p $WORKDIR/esm2_finetune_tutorial
# download sample data CSV for inference
DATA_PATH=$(download_bionemo_data esm2/testdata_esm2_infer:2.0 --source ngc)
RESULTS_PATH=$WORKDIR/esm2_finetune_tutorial/inference_results.pt
DATA_PATH=$(download_bionemo_data esm2/testdata_esm2_infer:2.0)
RESULTS_PATH=$WORKDIR/esm2_finetune_tutorial/
infer_esm2 --checkpoint-path <finetune checkpoint path> \
--data-path $DATA_PATH \
--results-path $RESULTS_PATH \
--config-class ESM2FineTuneSeqConfig
```

This will create a result `.pt` file under `$WORKDIR/esm2_finetune_tutorial/inference_results.pt` which can be loaded with PyTorch in a Python environment:
This will create a result `.pt` file under `$WORKDIR/esm2_finetune_tutorial/predictions__rank_0.pt` which can be loaded with PyTorch in a Python environment:

```python
import torch
# Set the path to results file e.g. /workspace/bionemo2/esm2_finetune_tutorial/inference_results.pt
# results_path = /workspace/bionemo2/esm2_finetune_tutorial/inference_results.pt
# Set the path to results file e.g. /workspace/bionemo2/esm2_finetune_tutorial/predictions__rank_0.pt
# results_path = /workspace/bionemo2/esm2_finetune_tutorial/predictions__rank_0.pt
results = torch.load(results_path)
# results is a python dict which includes the following result tensors for this example:
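# --- illustrative addition, not part of the original docs ---
# The exact keys depend on the config class, so a generic way to see
# what came back is to walk the dict and print each tensor's shape:
for key, value in results.items():
    print(key, value.shape if hasattr(value, "shape") else type(value))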
