From 7311aa2cd75463177756a1615d0539118d375586 Mon Sep 17 00:00:00 2001 From: "Peter St. John" Date: Tue, 21 Jan 2025 10:04:27 -0700 Subject: [PATCH] hotfix for some failing python tests due to NGC files being moved around (#626) Fixes failing tests due to NGC resource files being moved around, and renames some of the ESM-2 resources to better delineate checkpoints we trained on our internal data from those converted from the original ESM-2 model weights. --------- Signed-off-by: Peter St. John --- docs/docs/models/ESM-2/pre-training.md | 2 +- .../examples/bionemo-esm2/finetune.ipynb | 4 ++-- .../src/bionemo/core/data/resources/esm2.yaml | 21 +++++++++++++------ .../model/finetune/test_finetune_regressor.py | 6 +----- .../test_finetune_token_classifier.py | 5 ----- .../esm2/scripts/test_finetune_esm2.py | 8 ++----- .../bionemo/esm2/scripts/test_infer_esm2.py | 2 +- 7 files changed, 22 insertions(+), 26 deletions(-) diff --git a/docs/docs/models/ESM-2/pre-training.md b/docs/docs/models/ESM-2/pre-training.md index 418fd798a..0f1232ff4 100644 --- a/docs/docs/models/ESM-2/pre-training.md +++ b/docs/docs/models/ESM-2/pre-training.md @@ -22,7 +22,7 @@ Validation perplexity evaluated on the NVIDIA validation set. === "8M" ```python - esm2_8m_ckpt_path = load("esm2/nv_8m:2.0") + esm2_8m_ckpt_path = load("esm2/8m:2.0") ``` ### Training Script diff --git a/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb b/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb index aa26cd72f..ef1088b9f 100644 --- a/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb +++ b/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb @@ -376,7 +376,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The following code will download the internally pre-trained model, `esm2/nv_8m:2.0`, from the NGC registry. Please refer to [ESM-2 Model Overview](../../../models/ESM-2/index.md) for a list of available checkpoints." 
+ "The following code will download the internally pre-trained model, `esm2/8m:2.0`, from the NGC registry. Please refer to [ESM-2 Model Overview](../../../models/ESM-2/index.md) for a list of available checkpoints." ] }, { @@ -395,7 +395,7 @@ "source": [ "from bionemo.core.data.load import load\n", "\n", - "checkpoint_path = load(\"esm2/nv_8m:2.0\")\n", + "checkpoint_path = load(\"esm2/8m:2.0\")\n", "print(checkpoint_path)" ] }, diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml index d93139d75..d7749aa78 100644 --- a/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml +++ b/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml @@ -25,14 +25,23 @@ description: > An ESM-2 650M model pre-trained on NVIDIA's train/test data split. -- tag: nv_8m:2.0 +# - tag: nv_8m:2.1 +# ngc: "nvidia/clara/esm2nv8m:2.1" +# ngc_registry: model +# pbss: "s3://general-purpose/esm2/checkpoints/8m/esm2_8m_checkpoint.tar.gz" +# sha256: b4ea4d52eea8a25d2c2838617ff678f0da22d384cee195b0c192686816078dcd # pragma: allowlist secret +# owner: Peter St John +# description: > +# An ESM-2 8M model pre-trained on NVIDIA's train/test data split. + +- tag: 8m:2.0 ngc: "nvidia/clara/esm2nv8m:2.0" ngc_registry: model - pbss: "s3://general-purpose/esm2/checkpoints/8m/esm2_8m_checkpoint.tar.gz" - sha256: b4ea4d52eea8a25d2c2838617ff678f0da22d384cee195b0c192686816078dcd # pragma: allowlist secret + pbss: "s3://general-purpose/esm2/checkpoints/converted/8m/esm2_hf_converted_8m_checkpoint.tar.gz" + sha256: 2957b2c36d5978d0f595d6f1b72104b312621cf0329209086537b613c1c96d16 # pragma: allowlist secret owner: Peter St John description: > - An ESM-2 8M model pre-trained on NVIDIA's train/test data split. + The original 8M parameter ESM2 model weights converted to the NeMo2 checkpoint format. 
- tag: 650m:2.0 ngc: nvidia/clara/esm2nv650m:2.0 @@ -41,7 +50,7 @@ sha256: 0798767e843e3d54315aef91934d28ae7d8e93c2849d5fcfbdf5fac242013997 # pragma: allowlist secret owner: Farhad Ramezanghorbani description: > - A pretrained 650M parameter ESM2 model. See https://ngc.nvidia.com/catalog/models/nvidia:clara:esm2nv650m. + The original 650M parameter ESM2 model weights converted to the NeMo2 checkpoint format. - tag: 3b:2.0 ngc: nvidia/clara/esm2nv3b:2.0 @@ -50,7 +59,7 @@ sha256: a2248cfed1ef39f83bd32a0e08b84c0a8f39325d383e2d92767022ff7f5260ed # pragma: allowlist secret owner: Farhad Ramezanghorbani description: > - A pretrained 3B parameter ESM2 model. See https://ngc.nvidia.com/catalog/models/nvidia:clara:esm2nv3b. + The original 3B parameter ESM2 model weights converted to the NeMo2 checkpoint format. - tag: fulldata_esm2_pretrain:2.0 ngc: nvidia/clara/esm2_pretrain_nemo2_data:1.0 diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_regressor.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_regressor.py index f9ad25851..9e6751787 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_regressor.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_regressor.py @@ -26,13 +26,9 @@ from bionemo.testing import megatron_parallel_state_utils -# To download a 8M internally pre-trained ESM2 model -pretrain_ckpt_path = load("esm2/nv_8m:2.0") - - @pytest.fixture def config(): - return ESM2FineTuneSeqConfig(encoder_frozen=True, ft_dropout=0.50, initial_ckpt_path=str(pretrain_ckpt_path)) + return ESM2FineTuneSeqConfig(encoder_frozen=True, ft_dropout=0.50, initial_ckpt_path=str(load("esm2/8m:2.0"))) @pytest.fixture diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_token_classifier.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_token_classifier.py index fe4043831..3662074cd 100644 ---
a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_token_classifier.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_token_classifier.py @@ -16,7 +16,6 @@ import pytest -from bionemo.core.data.load import load from bionemo.esm2.data import tokenizer from bionemo.esm2.model.finetune.finetune_token_classifier import ( ESM2FineTuneTokenConfig, @@ -26,10 +25,6 @@ from bionemo.testing import megatron_parallel_state_utils -# To download a 8M internally pre-trained ESM2 model -pretrain_ckpt_path = load("esm2/nv_8m:2.0") - - @pytest.fixture def config(): return ESM2FineTuneTokenConfig(encoder_frozen=True, cnn_dropout=0.1, cnn_hidden_dim=32, cnn_num_classes=5) diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py index bc929a1ab..c51af07d5 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py @@ -30,10 +30,6 @@ from bionemo.testing.callbacks import MetricTracker -# To download a 8M internally pre-trained ESM2 model -pretrain_ckpt_path = load("esm2/nv_8m:2.0") - - def data_to_csv(data, tmp_path): """Create a mock protein dataset.""" csv_file = tmp_path / "protein_dataset.csv" @@ -57,7 +53,7 @@ def test_esm2_finetune_token_classifier( train_data_path=data_to_csv(dummy_data_per_token_classification_ft, tmp_path), valid_data_path=data_to_csv(dummy_data_per_token_classification_ft, tmp_path), experiment_name="finetune_new_head_token_classification", - restore_from_checkpoint_path=str(pretrain_ckpt_path), + restore_from_checkpoint_path=str(load("esm2/8m:2.0")), num_steps=n_steps_train, num_nodes=1, devices=1, @@ -102,7 +98,7 @@ def test_esm2_finetune_regressor( train_data_path=data_to_csv(dummy_data_single_value_regression_ft, tmp_path), 
valid_data_path=data_to_csv(dummy_data_single_value_regression_ft, tmp_path), experiment_name="finetune_new_head_regression", - restore_from_checkpoint_path=str(pretrain_ckpt_path), + restore_from_checkpoint_path=str(load("esm2/8m:2.0")), num_steps=n_steps_train, num_nodes=1, devices=1, diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py index 9575fafa9..b3c349b8f 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py @@ -69,7 +69,7 @@ def test_infer_runs( infer_model( data_path=data_path, - checkpoint_path=load("esm2/nv_8m:2.0"), + checkpoint_path=load("esm2/8m:2.0"), results_path=result_dir, min_seq_length=min_seq_len, prediction_interval=prediction_interval,