NVIDIA · polinabinder1 · Dec 17, 2024 · Dec 13, 2024 · Dec 16, 2024 · Dec 16, 2024
diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM
@@ -28,6 +28,20 @@
 __all__: Sequence[str] = ("RowFeatureIndex",)
 
 
+def are_dicts_equal(dict1: dict[str, np.ndarray], dict2: dict[str, np.ndarray]) -> bool:
+    """Return whether two dictionaries have the same keys and element-wise equal array values.
+
+    Args:
+        dict1: The first dictionary to compare.
+        dict2: The second dictionary to compare.
+
+    Returns:
+        True if both key sets match and `np.array_equal` holds for every key (key order is
+        ignored; arrays of different shape compare unequal); False otherwise.
+    """
+    return dict1.keys() == dict2.keys() and all(np.array_equal(dict1[k], dict2[k]) for k in dict1)
+
+
 class RowFeatureIndex:
     """Maintains a mapping between a row and its features.
 
@@ -100,10 +114,16 @@ def append_features(
         if isinstance(features, pd.DataFrame):
             raise TypeError("Expected a dictionary, but received a Pandas DataFrame.")
         csum = max(self._cumulative_sum_index[-1], 0)
-        self._cumulative_sum_index = np.append(self._cumulative_sum_index, csum + n_obs)
-        self._feature_arr.append(features)
-        self._num_genes_per_row.append(num_genes)
-        self._labels.append(label)
+
+        # If the new features equal the most recently stored ones, no new entry is appended: the last entry's row range
+        # is extended by n_obs instead (the incoming num_genes and label are not recorded in that case).
+        if len(self._feature_arr) > 0 and are_dicts_equal(self._feature_arr[-1], features):
+            self._cumulative_sum_index[-1] = csum + n_obs
+        else:
+            self._cumulative_sum_index = np.append(self._cumulative_sum_index, csum + n_obs)
+            self._feature_arr.append(features)
+            self._num_genes_per_row.append(num_genes)
+            self._labels.append(label)
 
     def lookup(self, row: int, select_features: Optional[list[str]] = None) -> Tuple[list[np.ndarray], str]:
         """Find the features at a given row.

@@ -20,7 +20,53 @@
 import pandas as pd
 import pytest
 
-from bionemo.scdl.index.row_feature_index import RowFeatureIndex
+from bionemo.scdl.index.row_feature_index import RowFeatureIndex, are_dicts_equal
+
+
+@pytest.fixture
+def dict1() -> dict[str, np.ndarray]:
+    return {"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])}  # reference dictionary for the comparisons
+
+
+@pytest.fixture
+def dict2() -> dict[str, np.ndarray]:
+    return {"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])}  # equal to dict1, but separate objects
+
+
+@pytest.fixture
+def dict3() -> dict[str, np.ndarray]:
+    return {"a": np.array([1, 2, 3]), "b": np.array([7, 8, 9])}  # same keys as dict1; "b" holds different values
+
+
+@pytest.fixture
+def dict4() -> dict[str, np.ndarray]:
+    return {"a": np.array([1, 2, 3]), "c": np.array([4, 5, 6])}  # same arrays as dict1, but key "c" instead of "b"
+
+
+@pytest.fixture
+def empty_dict() -> dict[str, np.ndarray]:
+    return {}  # no keys at all
+
+
+def test_equal_dicts(dict1: dict[str, np.ndarray], dict2: dict[str, np.ndarray]) -> None:
+    assert are_dicts_equal(dict1, dict2) is True  # separate objects with the same keys and array contents
+
+
+def test_unequal_values(dict1: dict[str, np.ndarray], dict3: dict[str, np.ndarray]) -> None:
+    assert are_dicts_equal(dict1, dict3) is False  # keys match, but the arrays under "b" differ
+
+
+def test_unequal_keys(dict1: dict[str, np.ndarray], dict4: dict[str, np.ndarray]) -> None:
+    assert are_dicts_equal(dict1, dict4) is False  # arrays match, but the key sets differ ("b" vs "c")
+
+
+def test_different_lengths(dict1: dict[str, np.ndarray]) -> None:
+    smaller_dict = {"a": np.array([1, 2, 3])}  # only a strict subset of dict1's keys
+    assert are_dicts_equal(dict1, smaller_dict) is False
+
+
+def test_empty_dicts(empty_dict: dict[str, np.ndarray]) -> None:
+    assert are_dicts_equal(empty_dict, empty_dict) is True  # same empty dict passed twice; no keys to compare
 
 
 @pytest.fixture
@@ -37,6 +83,20 @@ def create_first_RowFeatureIndex() -> RowFeatureIndex:
     return index
 
 
+@pytest.fixture
+def create_same_features_first_RowFeatureIndex() -> RowFeatureIndex:
+    """
+    Instantiate a RowFeatureIndex populated by one `append_features(6, one_feats, 3)` call and no label.
+
+    Returns:
+        A RowFeatureIndex with a single appended entry; 6 is presumably the row count (signature not shown here).
+    """
+    one_feats = {"feature_name": np.array(["FF", "GG", "HH"]), "feature_int": np.array([1, 2, 3])}
+    index = RowFeatureIndex()
+    index.append_features(6, one_feats, len(one_feats["feature_name"]))
+    return index
+
+
 @pytest.fixture
 def create_second_RowFeatureIndex() -> RowFeatureIndex:
     """
@@ -86,14 +146,17 @@ def test_feature_index_internals_on_single_index(create_first_RowFeatureIndex):
     assert len(vals) == 1
 
 
-def test_feature_index_internals_on_append(create_first_RowFeatureIndex):
+def test_feature_index_internals_on_append_different_features(
+    create_first_RowFeatureIndex, create_second_RowFeatureIndex
+):
     one_feats = {"feature_name": np.array(["FF", "GG", "HH"]), "feature_int": np.array([1, 2, 3])}
     two_feats = {
         "feature_name": np.array(["FF", "GG", "HH", "II", "ZZ"]),
         "gene_name": np.array(["RET", "NTRK", "PPARG", "TSHR", "EGFR"]),
         "spare": np.array([None, None, None, None, None]),
     }
-    create_first_RowFeatureIndex.append_features(8, two_feats, len(two_feats["feature_name"]), "MY_DATAFRAME")
+    create_first_RowFeatureIndex.concat(create_second_RowFeatureIndex)
+    # Replaces the former direct call: append_features(8, two_feats, len(two_feats["feature_name"]), "MY_DATAFRAME")
     assert len(create_first_RowFeatureIndex) == 2
     assert create_first_RowFeatureIndex.number_vars_at_row(1) == 3
     assert create_first_RowFeatureIndex.number_vars_at_row(13) == 5
@@ -113,6 +176,28 @@ def test_feature_index_internals_on_append(create_first_RowFeatureIndex):
     assert label == "MY_DATAFRAME"
 
 
+def test_feature_index_internals_on_append_same_features(create_first_RowFeatureIndex: RowFeatureIndex) -> None:
+    one_feats = {"feature_name": np.array(["FF", "GG", "HH"]), "feature_int": np.array([1, 2, 3])}
+    create_first_RowFeatureIndex.concat(create_first_RowFeatureIndex)
+    # The index is concatenated with itself: identical features should extend the existing entry, not add a new one.
+    assert len(create_first_RowFeatureIndex) == 1
+    assert create_first_RowFeatureIndex.number_vars_at_row(1) == 3
+    assert create_first_RowFeatureIndex.number_vars_at_row(13) == 3
+    assert create_first_RowFeatureIndex.number_vars_at_row(19) == 3
+    assert create_first_RowFeatureIndex.number_vars_at_row(2) == 3
+    assert sum(create_first_RowFeatureIndex.number_of_values()) == 2 * (12 * 3)
+    assert create_first_RowFeatureIndex.number_of_values()[0] == 2 * (12 * 3)
+    assert create_first_RowFeatureIndex.number_of_rows() == 24
+    feats, label = create_first_RowFeatureIndex.lookup(row=3, select_features=None)
+    assert np.all(feats[0] == one_feats["feature_name"])
+    assert np.all(feats[1] == one_feats["feature_int"])
+    assert label is None
+    feats, label = create_first_RowFeatureIndex.lookup(row=15, select_features=None)
+    assert np.all(feats[0] == one_feats["feature_name"])
+    assert np.all(feats[1] == one_feats["feature_int"])
+    assert label is None
+
+
 def test_concat_length(
     create_first_RowFeatureIndex,
     create_second_RowFeatureIndex,