Fully Integrate SCDL into Geneformer #480

Merged: 32 commits, Dec 20, 2024
Changes from 25 commits

Commits (32)
08ef851
Change RowFeatureIndex and RowFeatureIndex tests to use a list of dic…
Nov 20, 2024
c9ff683
Update load_h5ad to append features in dict format to the row feat…
Nov 20, 2024
5f86da4
Modify Single Cell Memmap Dataset unit tests to reflect changes
Nov 20, 2024
896bad0
remove conversion to np.array in get_row for now
Nov 20, 2024
eb17845
Convert values and col indices to np array so that we're not returnin…
Nov 21, 2024
1663903
Revert conversion to np array, and refactor num_vars_at_row to use in…
Nov 22, 2024
0497c98
Merge branch 'main' into savitha/scdl-performance-improvements
savitha-eng Nov 22, 2024
7a43706
Made changes requested in review.
Nov 25, 2024
da395b4
Merge branch 'savitha/scdl-performance-improvements' of github.com:NV…
Nov 25, 2024
9e11ab8
Integrate SCDL into Geneformer, rebased on the latest changes in main
Nov 26, 2024
37de5d1
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
savitha-eng Nov 26, 2024
546f84e
Tests for Geneformer SingleCellDataset
Nov 26, 2024
0dcc56b
Merge branch 'savitha/integrate-scdl-geneformer-rebased' of github.co…
Nov 26, 2024
9d4c6a4
Data directory fixtures needed for pytest
Nov 26, 2024
eea6b42
Add bypass_tokenize_vocab to the arguments for this script
Nov 26, 2024
e642bc9
Changes to Inference tutorial notebook to support SCDL integrated Gen…
Dec 2, 2024
d755901
modify dataset dir creation
Dec 2, 2024
507e31b
all scdl integration changes
Dec 2, 2024
f6d9380
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
savitha-eng Dec 2, 2024
9846894
Updated documentation, removed refs to sc_memmap, & made changes requ…
Dec 4, 2024
6afed04
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
savitha-eng Dec 4, 2024
2e18cbb
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
savitha-eng Dec 5, 2024
dc093e2
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
polinabinder1 Dec 11, 2024
13ec59f
Make CLI argument for checking token vocab more understandable
Dec 13, 2024
1e9b58a
merge main
polinabinder1 Dec 13, 2024
452dc6c
merge main
polinabinder1 Dec 18, 2024
7bdddb0
adding fixed length
polinabinder1 Dec 18, 2024
71bddc3
notebook updates
polinabinder1 Dec 18, 2024
fdfc18c
adding correct notebook
polinabinder1 Dec 19, 2024
40e34cc
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
polinabinder1 Dec 19, 2024
1cbbf05
Merge branch 'main' into savitha/integrate-scdl-geneformer-rebased
polinabinder1 Dec 20, 2024
696d42c
test case fixes
polinabinder1 Dec 20, 2024
1 change: 0 additions & 1 deletion .gitignore
@@ -2,7 +2,6 @@
docs/site/
*.nemo
protein/
singlecell/
results/

# Local configs
16 changes: 8 additions & 8 deletions README.md
@@ -279,10 +279,10 @@ type, and then pass in the config type to the training recipe.
Similar to ESM-2, you can download the dataset and checkpoint through our utility function.

```bash
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20241203 --source $MY_DATA_SOURCE); \
GENEFORMER_10M_CKPT=$(download_bionemo_data geneformer/10M_240530:2.0 --source $MY_DATA_SOURCE); \
train_geneformer \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small_processed_scdl \
--result-dir ./results \
--restore-from-checkpoint-path ${GENEFORMER_10M_CKPT} \
--experiment-name test_experiment \
@@ -305,9 +305,9 @@ copy the `sub-projects/bionemo-geneformer/geneformer/scripts/train_geneformer.py
Simple fine-tuning example (**NOTE**: please change `--restore-from-checkpoint-path` to be the checkpoint directory path that was output last
by the previous train run)
```bash
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20241203 --source $MY_DATA_SOURCE); \
train_geneformer \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
--data-dir ${TEST_DATA_DIR}/cellxgene_2023-12-15_small_processed_scdl \
--result-dir ./results \
--experiment-name test_finettune_experiment \
--num-gpus 1 \
@@ -331,11 +331,11 @@ customizations for your task.


```bash
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20241203 --source $MY_DATA_SOURCE); \
bionemo-geneformer-recipe \
--recipe geneformer_10m_pretrain_recipe \
--dest my_config.yaml \
--data-path ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
--recipe 10m-pretrain \
--dest my_config.json \
--data-path ${TEST_DATA_DIR}/cellxgene_2023-12-15_small_processed_scdl \
--result-dir ./results
```
> ⚠️ **IMPORTANT:** Inspect and edit the contents of the outputted my_config.yaml as you see fit
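A side note on the recipe workflow shown in the README hunk above: once `bionemo-geneformer-recipe` has written `my_config.yaml`, it can be inspected programmatically before editing and launching training. The snippet below is only an illustration; it assumes PyYAML is installed and that the generated file is plain YAML.

```python
# Illustrative only: peek at the generated recipe config before editing or launching training.
# Assumes PyYAML is installed and my_config.yaml is the file written by the recipe command above.
import yaml

with open("my_config.yaml") as f:
    config = yaml.safe_load(f)

# Print the top-level sections so you know what to review (data paths, parallelism, optimizer, ...).
print(list(config))
```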

Large diffs are not rendered by default.

@@ -5,3 +5,11 @@
sha256: 7a4237537bf535dfa00301ce8cc7073e0a23d5bc8aa902ad65db9f51b57a6df9 # pragma: allowlist secret
owner: Polina Binder <pbinder@nvidia.com>
description: Sample test data for SCDL.

- tag: sample_scdl_feature_ids
ngc: nvidia/clara/scdl_sample_test_feature_ids:1.0
ngc_registry: resource
pbss: s3://bionemo-ci/test-data/scdl_sample_test_feat_ids.tar.gz
sha256: 9020ba336dbfe33bddadba26ca0cde49958cbd73c5ad44f0960a5a4837c9db26 # pragma: allowlist secret
owner: Savitha Srinivasan <savithas@nvidia.com>
description: Sample test data for SCDL with feature IDs appended.
@@ -21,3 +21,11 @@
sha256: ab038b184de52e53ff7bcea5e01d97d55944c507db88c0495bdf9e5e9e0303a4 # pragma: allowlist secret
owner: John St John <jstjohn@nvidia.com>
description: Golden values for geneformer QA model.

- tag: testdata-20241203
ngc: nvidia/clara/singlecell-testdata:2.0
ngc_registry: resource
pbss: "s3://bionemo-ci/test-data/singlecell/singlecell-scdltestdata-20241203.tar.gz"
sha256: d8e3ea569bc43768c24aa651aff77722df202078415528497c22394046b08cc3 # pragma: allowlist secret
owner: Savitha Srinivasan <savithas@nvidia.com>
description: Test data for single cell models in SCDL Memmap format.
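Both new resource entries above pair a download location with a pinned sha256 digest. A generic way to check a downloaded tarball against the recorded value is sketched below; the local filename is only a placeholder for wherever the archive was saved.

```python
# Generic integrity check for a downloaded resource tarball; the filename is a placeholder.
import hashlib

EXPECTED_SHA256 = "d8e3ea569bc43768c24aa651aff77722df202078415528497c22394046b08cc3"  # testdata-20241203


def sha256sum(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file in chunks so large tarballs do not need to fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


assert sha256sum("singlecell-scdltestdata-20241203.tar.gz") == EXPECTED_SHA256
```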
2 changes: 1 addition & 1 deletion sub-packages/bionemo-geneformer/README.md
@@ -16,7 +16,7 @@ pytest -v .


## Acquiring Data
Datasets are expected to be in the form of AnnData (.h5ad) objects such as those downloaded from [Cell x Gene | CZI](https://chanzuckerberg.github.io/cellxgene-census/). They are then pre-processed with either `bionemo-geneformer/src/bionemo/geneformer/data/singlecell/sc_memmap.py` or with sc-DL.
Datasets are expected to be in the form of AnnData (.h5ad) objects such as those downloaded from [Cell x Gene | CZI](https://chanzuckerberg.github.io/cellxgene-census/). They are then pre-processed with `sub-packages/bionemo-scdl/src/bionemo/scdl/scripts/convert_h5ad_to_scdl.py`.

## Geneformer-nv 10M and 106M
Refer to the Dataset cards and Model cards to learn more about the pre-trained checkpoints provided for both 10M and 106M of Geneformer-nv.
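For readers of the updated "Acquiring Data" text above: besides the `convert_h5ad_to_scdl.py` script it references, the conversion can also be sketched in a few lines of Python. The import path, constructor signature, and `save()` call below are assumptions about the bionemo-scdl API and may not match the released package exactly; the script remains the supported path.

```python
# Hedged sketch: build an SCDL memmap dataset directly from one AnnData (.h5ad) file.
# The import path, constructor signature, and save() method are assumptions about the
# bionemo-scdl API; convert_h5ad_to_scdl.py (referenced above) is the supported route.
from bionemo.scdl.io.single_cell_memmap_dataset import SingleCellMemMapDataset

# Reads my_cells.h5ad (placeholder path) and writes memmap-backed arrays under my_dataset_scdl/.
dataset = SingleCellMemMapDataset("my_dataset_scdl", h5ad_path="my_cells.h5ad")
dataset.save()  # flush metadata so training scripts can reopen the directory via --data-dir
```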
1 change: 0 additions & 1 deletion sub-packages/bionemo-geneformer/pyproject.toml
@@ -21,7 +21,6 @@ dependencies = [
[project.scripts]
bionemo-geneformer-train= "bionemo.geneformer.run.main:main"
bionemo-geneformer-recipe= "bionemo.geneformer.run.recipes:main"
sc_memmap = "bionemo.geneformer.scripts.sc_memmap:main_cli"
infer_geneformer = "bionemo.geneformer.scripts.infer_geneformer:geneformer_infer_entrypoint"
train_geneformer = "bionemo.geneformer.scripts.train_geneformer:entrypoint"
geneformer_mlm_loss_eval = "bionemo.geneformer.scripts.geneformer_mlm_loss_eval:entrypoint"
@@ -128,6 +128,7 @@ def main(
seq_len_nv: int = 2048,
seq_len_hf: int = 2048,
seed: int = 513,
include_unrecognized_vocab_in_dataset: bool = False,
):
"""Inference function (requires DDP and only training data that fits in memory)."""
# This is just used to get the tokenizer :(
@@ -185,6 +186,7 @@
max_len=seq_len_nv,
mask_prob=mask_prob,
seed=seed,
include_unrecognized_vocab_in_dataset=include_unrecognized_vocab_in_dataset,
)
ds_hf_nvfilt = SingleCellDataset(
dataset_path,
@@ -194,6 +196,7 @@
mask_prob=mask_prob,
eos_token=hf_tokenizer.token_to_id(hf_tokenizer.sep_token), # Stored in the special token
seed=seed,
include_unrecognized_vocab_in_dataset=include_unrecognized_vocab_in_dataset,
)
print(f"Loaded dataset of length (NV): {len(ds_nv)}, (HF): {len(ds_hf_nvfilt)}")

@@ -299,6 +302,11 @@ def entrypoint():
)
parser.add_argument("--hf-model-path", type=str, default="ctheodoris/Geneformer", help="HF model path")
parser.add_argument("--dataset-path", type=Path, help="Path to dataset directory", required=True)
parser.add_argument(
"--include-unrecognized-vocab-in-dataset",
action="store_true",
help="If set to true, a hard-check is performed to verify all gene identifiers are in the user-supplied tokenizer vocab. Defaults to false, which means any gene identifier not in the user-supplied tokenizer vocab will be excluded.",
)

args = parser.parse_args()
main(
@@ -307,6 +315,7 @@
args.dataset_path,
args.hf_token_dictionary_path,
args.hf_medians_dictionary_path,
args.include_unrecognized_vocab_in_dataset,
)


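The `--include-unrecognized-vocab-in-dataset` flag added above is documented as a hard check versus silent exclusion of gene identifiers missing from the tokenizer vocab. A minimal, self-contained sketch of that documented behavior (not the repository's implementation) looks like this:

```python
# Minimal sketch of the documented flag semantics; not the repository implementation.
from typing import Dict, List


def filter_gene_ids(
    gene_ids: List[str],
    vocab: Dict[str, int],
    include_unrecognized_vocab_in_dataset: bool = False,
) -> List[str]:
    unknown = [g for g in gene_ids if g not in vocab]
    if include_unrecognized_vocab_in_dataset and unknown:
        # Hard check: every identifier must already be in the user-supplied vocab.
        raise ValueError(f"{len(unknown)} gene identifiers missing from tokenizer vocab, e.g. {unknown[:3]}")
    # Default behavior: silently exclude identifiers the tokenizer does not recognize.
    return [g for g in gene_ids if g in vocab]


vocab = {"ENSG00000139618": 10, "ENSG00000141510": 11}
print(filter_gene_ids(["ENSG00000139618", "ENSG_UNKNOWN"], vocab))  # -> ['ENSG00000139618']
```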
@@ -51,6 +51,7 @@ class SingleCellDataModule(MegatronDataModule):
num_mask_per_sample (int): Number of masked versions of a single sample to be returned by each worker
train_batch_size (int): Batch size for training
val_batch_size (int): Batch size for validation
include_unrecognized_vocab_in_dataset (bool, optional): If set to True, a hard-check is performed to verify all gene identifiers are in the user-supplied tokenizer vocab. Defaults to False, which means any gene identifier not in the user-supplied tokenizer vocab will be excluded.

Attributes:
cfg (Config): Configuration object
@@ -82,6 +83,7 @@ def __init__( # noqa: D107
num_workers: int = 10, # TODO can this be automatically set?
persistent_workers: bool = True,
pin_memory: bool = True,
include_unrecognized_vocab_in_dataset: bool = False,
) -> None:
super().__init__()
if predict_dataset_path is None:
@@ -122,6 +124,7 @@ def __init__( # noqa: D107
mask_token_prob=self.mask_token_prob,
random_token_prob=self.random_token_prob,
seed=random_utils.get_seed_from_rng(rng),
include_unrecognized_vocab_in_dataset=include_unrecognized_vocab_in_dataset,
)
self._val_dataset_ori = SingleCellDataset(
self.data_path_val,
@@ -132,6 +135,7 @@ def __init__( # noqa: D107
mask_token_prob=self.mask_token_prob,
random_token_prob=self.random_token_prob,
seed=random_utils.get_seed_from_rng(rng),
include_unrecognized_vocab_in_dataset=include_unrecognized_vocab_in_dataset,
)
self._test_dataset_ori = SingleCellDataset(
self.data_path_test,
@@ -142,6 +146,7 @@ def __init__( # noqa: D107
mask_token_prob=self.mask_token_prob,
random_token_prob=self.random_token_prob,
seed=random_utils.get_seed_from_rng(rng),
include_unrecognized_vocab_in_dataset=include_unrecognized_vocab_in_dataset,
)
self._predict_dataset_ori = None
else:
@@ -155,6 +160,7 @@
mask_token_prob=self.mask_token_prob,
random_token_prob=self.random_token_prob,
seed=random_utils.get_seed_from_rng(rng),
include_unrecognized_vocab_in_dataset=include_unrecognized_vocab_in_dataset,
)
self._train_dataset_ori = None
self._val_dataset_ori = None