From 7311aa2cd75463177756a1615d0539118d375586 Mon Sep 17 00:00:00 2001 From: "Peter St. John" Date: Tue, 21 Jan 2025 10:04:27 -0700 Subject: [PATCH] hotfix for some failing python tests due to NGC files being moved around (#626) Fixes failing tests due to NGC resource files being moved around, and renames some of the ESM-2 resources to better delineate checkpoints we trained on our internal data from those converted from the original ESM-2 model weights. --------- Signed-off-by: Peter St. John --- docs/docs/models/ESM-2/pre-training.md | 2 +- .../examples/bionemo-esm2/finetune.ipynb | 4 ++-- .../src/bionemo/core/data/resources/esm2.yaml | 21 +++++++++++++------ .../model/finetune/test_finetune_regressor.py | 6 +----- .../test_finetune_token_classifier.py | 5 ----- .../esm2/scripts/test_finetune_esm2.py | 8 ++----- .../bionemo/esm2/scripts/test_infer_esm2.py | 2 +- 7 files changed, 22 insertions(+), 26 deletions(-) diff --git a/docs/docs/models/ESM-2/pre-training.md b/docs/docs/models/ESM-2/pre-training.md index 418fd798a..0f1232ff4 100644 --- a/docs/docs/models/ESM-2/pre-training.md +++ b/docs/docs/models/ESM-2/pre-training.md @@ -22,7 +22,7 @@ Validation perplexity evaluated on the NVIDIA validation set. === "8M" ```python - esm2_8m_ckpt_path = load("esm2/nv_8m:2.0") + esm2_8m_ckpt_path = load("esm2/8m:2.0") ``` ### Training Script diff --git a/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb b/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb index aa26cd72f..ef1088b9f 100644 --- a/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb +++ b/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb @@ -376,7 +376,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The following code will download the internally pre-trained model, `esm2/nv_8m:2.0`, from the NGC registry. Please refer to [ESM-2 Model Overview](../../../models/ESM-2/index.md) for a list of available checkpoints." 
+ "The following code will download the internally pre-trained model, `esm2/8m:2.0`, from the NGC registry. Please refer to [ESM-2 Model Overview](../../../models/ESM-2/index.md) for a list of available checkpoints." ] }, { @@ -395,7 +395,7 @@ "source": [ "from bionemo.core.data.load import load\n", "\n", - "checkpoint_path = load(\"esm2/nv_8m:2.0\")\n", + "checkpoint_path = load(\"esm2/8m:2.0\")\n", "print(checkpoint_path)" ] }, diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml index d93139d75..d7749aa78 100644 --- a/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml +++ b/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml @@ -25,14 +25,23 @@ description: > An ESM-2 650M model pre-trained on NVIDIA's train/test data split. -- tag: nv_8m:2.0 +# - tag: nv_8m:2.1 +# ngc: "nvidia/clara/esm2nv8m:2.1" +# ngc_registry: model +# pbss: "s3://general-purpose/esm2/checkpoints/8m/esm2_8m_checkpoint.tar.gz" +# sha256: b4ea4d52eea8a25d2c2838617ff678f0da22d384cee195b0c192686816078dcd # pragma: allowlist secret +# owner: Peter St John +# description: > +# An ESM-2 8M model pre-trained on NVIDIA's train/test data split. + +- tag: 8m:2.0 ngc: "nvidia/clara/esm2nv8m:2.0" ngc_registry: model - pbss: "s3://general-purpose/esm2/checkpoints/8m/esm2_8m_checkpoint.tar.gz" - sha256: b4ea4d52eea8a25d2c2838617ff678f0da22d384cee195b0c192686816078dcd # pragma: allowlist secret + pbss: "s3://general-purpose/esm2/checkpoints/converted/8m/esm2_hf_converted_8m_checkpoint.tar.gz" + sha256: 2957b2c36d5978d0f595d6f1b72104b312621cf0329209086537b613c1c96d16 # pragma: allowlist secret owner: Peter St John description: > - An ESM-2 8M model pre-trained on NVIDIA's train/test data split. + The original 8M parameter ESM2 model weights converted to the NeMo2 checkpoint format. 
- tag: 650m:2.0 ngc: nvidia/clara/esm2nv650m:2.0 @@ -41,7 +50,7 @@ sha256: 0798767e843e3d54315aef91934d28ae7d8e93c2849d5fcfbdf5fac242013997 # pragma: allowlist secret owner: Farhad Ramezanghorbani description: > - A pretrained 650M parameter ESM2 model. See https://ngc.nvidia.com/catalog/models/nvidia:clara:esm2nv650m. + The original 650M parameter ESM2 model weights converted to the NeMo2 checkpoint format. - tag: 3b:2.0 ngc: nvidia/clara/esm2nv3b:2.0 @@ -50,7 +59,7 @@ sha256: a2248cfed1ef39f83bd32a0e08b84c0a8f39325d383e2d92767022ff7f5260ed # pragma: allowlist secret owner: Farhad Ramezanghorbani description: > - A pretrained 3B parameter ESM2 model. See https://ngc.nvidia.com/catalog/models/nvidia:clara:esm2nv3b. + The original 3B parameter ESM2 model weights converted to the NeMo2 checkpoint format. - tag: fulldata_esm2_pretrain:2.0 ngc: nvidia/clara/esm2_pretrain_nemo2_data:1.0 diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_regressor.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_regressor.py index f9ad25851..9e6751787 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_regressor.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_regressor.py @@ -26,13 +26,9 @@ from bionemo.testing import megatron_parallel_state_utils -# To download a 8M internally pre-trained ESM2 model -pretrain_ckpt_path = load("esm2/nv_8m:2.0") - - @pytest.fixture def config(): - return ESM2FineTuneSeqConfig(encoder_frozen=True, ft_dropout=0.50, initial_ckpt_path=str(pretrain_ckpt_path)) + return ESM2FineTuneSeqConfig(encoder_frozen=True, ft_dropout=0.50, initial_ckpt_path=str(load("esm2/8m:2.0"))) @pytest.fixture diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_token_classifier.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_token_classifier.py index fe4043831..3662074cd 100644 ---
a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_token_classifier.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune_token_classifier.py @@ -16,7 +16,6 @@ import pytest -from bionemo.core.data.load import load from bionemo.esm2.data import tokenizer from bionemo.esm2.model.finetune.finetune_token_classifier import ( ESM2FineTuneTokenConfig, @@ -26,10 +25,6 @@ from bionemo.testing import megatron_parallel_state_utils -# To download a 8M internally pre-trained ESM2 model -pretrain_ckpt_path = load("esm2/nv_8m:2.0") - - @pytest.fixture def config(): return ESM2FineTuneTokenConfig(encoder_frozen=True, cnn_dropout=0.1, cnn_hidden_dim=32, cnn_num_classes=5) diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py index bc929a1ab..c51af07d5 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py @@ -30,10 +30,6 @@ from bionemo.testing.callbacks import MetricTracker -# To download a 8M internally pre-trained ESM2 model -pretrain_ckpt_path = load("esm2/nv_8m:2.0") - - def data_to_csv(data, tmp_path): """Create a mock protein dataset.""" csv_file = tmp_path / "protein_dataset.csv" @@ -57,7 +53,7 @@ def test_esm2_finetune_token_classifier( train_data_path=data_to_csv(dummy_data_per_token_classification_ft, tmp_path), valid_data_path=data_to_csv(dummy_data_per_token_classification_ft, tmp_path), experiment_name="finetune_new_head_token_classification", - restore_from_checkpoint_path=str(pretrain_ckpt_path), + restore_from_checkpoint_path=str(load("esm2/8m:2.0")), num_steps=n_steps_train, num_nodes=1, devices=1, @@ -102,7 +98,7 @@ def test_esm2_finetune_regressor( train_data_path=data_to_csv(dummy_data_single_value_regression_ft, tmp_path), 
valid_data_path=data_to_csv(dummy_data_single_value_regression_ft, tmp_path), experiment_name="finetune_new_head_regression", - restore_from_checkpoint_path=str(pretrain_ckpt_path), + restore_from_checkpoint_path=str(load("esm2/8m:2.0")), num_steps=n_steps_train, num_nodes=1, devices=1, diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py index 9575fafa9..b3c349b8f 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_infer_esm2.py @@ -69,7 +69,7 @@ def test_infer_runs( infer_model( data_path=data_path, - checkpoint_path=load("esm2/nv_8m:2.0"), + checkpoint_path=load("esm2/8m:2.0"), results_path=result_dir, min_seq_length=min_seq_len, prediction_interval=prediction_interval,