Reintroduce dictionaries for data prefixes in GPT (NVIDIA#8362)

* Bugfix

  Signed-off-by: Jan Baczek <jbaczek@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* Undo "fixes" of the bot

  Signed-off-by: Jan Baczek <jbaczek@nvidia.com>

---------

Signed-off-by: Jan Baczek <jbaczek@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com>
layalir · Feb 29, 2024 · 7179661 · 7179661
1 parent 78fff29
commit 7179661
Showing 1 changed file with 6 additions and 1 deletion.
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -1200,7 +1200,6 @@ def build_train_valid_test_datasets(self):
             "is_built_on_rank": is_dataset_built_on_rank,
             "random_seed": self.cfg.seed,
             "sequence_length": self.cfg.data.seq_length,
-            "blend": self.cfg.data.data_prefix,
             "split": self.cfg.data.splits_string,
             "path_to_cache": self.cfg.data.index_mapping_dir,
             "tokenizer": self.tokenizer,
@@ -1210,6 +1209,12 @@ def build_train_valid_test_datasets(self):
             "mock": mock_dataset,
         }
 
+        if isinstance(self.cfg.data.data_prefix, DictConfig):
+            _pref = self.cfg.data.data_prefix
+            kwargs['blend_per_split'] = [_pref['train'], _pref['validation'], _pref['test']]
+        else:
+            kwargs['blend'] = self.cfg.data.data_prefix
+
         if self.cfg.data.get('add_fim', False):
             dataset_config = GPTFIMDatasetConfig(self.tokenizer, self.cfg.data.fim, **kwargs)