Add ability to give separate datasets for test, train and validation (NVIDIA#4798)

* Add ability to give separate datasets for test, train and validation

* Addressed Sandeep's comments

* Addressed Sandeep's comments

* Add ability to give separate datasets for test, train and validation

* Add ability to give separate datasets for test, train and validation

* Addressed review comments

* Bug fix for common dataset utils

* Add CI tests

Signed-off-by: shanmugamr1992 <shanmugamr1992@gmail.com>

* Reformat code

Signed-off-by: shanmugamr1992 <shanmugamr1992@gmail.com>

* Bug fix

Signed-off-by: shanmugamr1992 <shanmugamr1992@gmail.com>

* Bug fix

* Bug Fix

* Bug Fix

* Update Jenkinsfile

* Addressed comments

* Addressed Erik's comments.

* Addressed Sandeep's comments

* Update Jenkinsfile

* Update Jenkinsfile

* Update dataset_utils.py

* Update Jenkinsfile

* Update Jenkinsfile

* Use GPT CI config

Signed-off-by: MaximumEntropy <sandeep.subramanian.1@umontreal.ca>

Signed-off-by: shanmugamr1992 <shanmugamr1992@gmail.com>
Signed-off-by: MaximumEntropy <sandeep.subramanian.1@umontreal.ca>
Co-authored-by: MaximumEntropy <sandeep.subramanian.1@umontreal.ca>
Signed-off-by: Hainan Xu <hainanx@nvidia.com>
2 people authored and Hainan Xu committed Nov 29, 2022
1 parent c994e85 commit 3f041ba
Showing 6 changed files with 561 additions and 232 deletions.
14 changes: 7 additions & 7 deletions Jenkinsfile
@@ -3604,10 +3604,10 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
 trainer.devices=2 \
 trainer.accelerator=gpu \
 trainer.log_every_n_steps=1 \
-trainer.val_check_interval=10 \
+trainer.val_check_interval=2 \
 trainer.limit_val_batches=2 \
 trainer.accumulate_grad_batches=1 \
-trainer.max_steps=10 \
+trainer.max_steps=3 \
 trainer.precision=16 \
 trainer.gradient_clip_val=1.0 \
 exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
@@ -3627,15 +3627,15 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
 model.decoder.bias_activation_fusion=False \
 model.decoder.activations_checkpoint_method='block' \
 model.decoder.activations_checkpoint_num_layers=1 \
-model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
+model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'"
 sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \
 trainer.devices=2 \
 trainer.accelerator=gpu \
 trainer.log_every_n_steps=1 \
-trainer.val_check_interval=10 \
-trainer.limit_val_batches=2 \
+trainer.val_check_interval=2 \
+trainer.limit_val_batches=1 \
 trainer.accumulate_grad_batches=1 \
-trainer.max_steps=10 \
+trainer.max_steps=6 \
 trainer.precision=16 \
 trainer.gradient_clip_val=1.0 \
 exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \
@@ -3656,7 +3656,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
 model.decoder.bias_activation_fusion=False \
 model.decoder.activations_checkpoint_method='block' \
 model.decoder.activations_checkpoint_num_layers=1 \
-model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
+model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'"
 sh "rm -rf examples/nlp/language_modeling/bart_pretrain_results"
 }
 }
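Note: the dictionary form used in the two CI commands above is ordinary YAML once the shell quoting is stripped. A quick sanity-check sketch, not part of this PR, showing it parse with OmegaConf (which NeMo configs are built on); the paths are placeholders, not the real TestData locations:

from omegaconf import OmegaConf

# Illustrative check only: the dict-style value round-trips as YAML.
cfg = OmegaConf.create(
    "data_prefix: {train: [1.0, /data/pile_text_document], "
    "test: [/data/pile_text_document], "
    "validation: [/data/pile_text_document]}"
)
assert set(cfg.data_prefix.keys()) == {"train", "test", "validation"}
print(OmegaConf.to_yaml(cfg.data_prefix))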
8 changes: 6 additions & 2 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -118,14 +118,18 @@ model:
   sequence_parallel: False
 
   data:
-    # Path to data must be specified by the user.
-    # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]",
+    # Path to data must be specified by the user.
+    # Supports List, String and Dictionary
+    # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]",
     # Or see example below:
     # data_prefix:
     #   - .5
     #   - /raid/data/pile/my-gpt3_00_text_document
     #   - .5
     #   - /raid/data/pile/my-gpt3_01_text_document
+    # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test}
+    # Or see example below:
+    # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}"
     data_prefix: ???
     index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
     data_impl: mmap
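Since data_prefix now accepts a List, a String, or a Dictionary keyed by split, here is a minimal sketch of how a consumer could dispatch on the three forms; the helper name resolve_split_prefix and its exact behavior are illustrative assumptions, not NeMo's actual API:

from typing import Dict, List, Union

def resolve_split_prefix(data_prefix: Union[str, List, Dict], split: str):
    # Dictionary form: each split carries its own (optionally weighted)
    # list of dataset prefixes.
    if isinstance(data_prefix, dict):
        return data_prefix[split]
    # String or interleaved [weight, prefix, weight, prefix, ...] list:
    # every split draws from the same blended data.
    return data_prefix

# e.g. the validation split only sees its own paths:
print(resolve_split_prefix(
    {"train": [1.0, "/path/to/data"], "validation": ["/path/to/data"], "test": ["/path/to/test"]},
    "validation",
))  # ['/path/to/data']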
10 changes: 7 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_t5_config.yaml
@@ -92,14 +92,18 @@ model:
   apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
 
   data:
-    # Path to data must be specified by the user.
-    # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]",
-    # Or see example below:
+    # Path to data must be specified by the user.
+    # Supports List, String and Dictionary
+    # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]",
+    # Or see example below:
     # data_prefix:
     #   - .5
     #   - /raid/data/pile/my-t5_00_text_document
     #   - .5
     #   - /raid/data/pile/my-t5_01_text_document
+    # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test}
+    # Or see example below:
+    # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}"
     data_prefix: ???
     index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
     data_impl: mmap # mmap, retmmap, text_mmap, csv_mmap
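For the weighted list form shared by both configs, a short sketch of how the flat [weight, prefix, weight, prefix, ...] layout separates into parallel lists; the normalization step is an assumption here, and the real parsing lives in the dataset utilities changed below:

def split_weights_and_prefixes(flat):
    # Alternating weight, prefix pairs; illustrative, not NeMo's exact code.
    assert len(flat) % 2 == 0, "expects alternating weight, prefix pairs"
    weights = [float(flat[i]) for i in range(0, len(flat), 2)]
    prefixes = [str(flat[i]) for i in range(1, len(flat), 2)]
    total = sum(weights)
    return prefixes, [w / total for w in weights]  # normalize weights to sum to 1

prefixes, weights = split_weights_and_prefixes(
    [0.5, "/raid/data/pile/my-t5_00_text_document", 0.5, "/raid/data/pile/my-t5_01_text_document"]
)
print(weights)  # [0.5, 0.5]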
dataset_utils.py
@@ -15,7 +15,7 @@
 import math
 
 
-def get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples):
+def get_datasets_weights_and_num_samples(data_prefix, num_samples):
 
     # The data prefix should be in the format of:
     # weight-1, data-prefix-1, weight-2, data-prefix-2, ..
@@ -39,9 +39,11 @@ def get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples):
     # TODO: check data leakage between train/val/test?
     datasets_train_valid_test_num_samples = []
     for weight in weights:
-        datasets_train_valid_test_num_samples.append(
-            [int(math.ceil(val * weight * 1.005)) for val in train_valid_test_num_samples]
-        )
+        # Comes here when we have separate train, test and validation datasets.
+        if isinstance(num_samples, int):
+            datasets_train_valid_test_num_samples.append(int(math.ceil(num_samples * weight * 1.005)))
+        else:
+            datasets_train_valid_test_num_samples.append([int(math.ceil(val * weight * 1.005)) for val in num_samples])
 
     return prefixes, weights, datasets_train_valid_test_num_samples
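To make the new branch concrete, a runnable sketch of just the changed loop; the helper name padded_num_samples is illustrative, while the 1.005 oversampling pad comes from the original code:

import math

def padded_num_samples(weights, num_samples):
    # weights: normalized per-dataset blend weights.
    # num_samples: one int when train/test/validation are separate datasets,
    # or a [train, valid, test] list for a single blended dataset.
    out = []
    for weight in weights:
        if isinstance(num_samples, int):
            out.append(int(math.ceil(num_samples * weight * 1.005)))
        else:
            out.append([int(math.ceil(val * weight * 1.005)) for val in num_samples])
    return out

print(padded_num_samples([0.5, 0.5], [1000, 100, 10]))  # [[503, 51, 6], [503, 51, 6]]
print(padded_num_samples([0.5, 0.5], 1000))             # [503, 503]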

