Merge branch 'main' of https://github.com/NVIDIA/NeMo into casual_conformer_lookahead_newdesign
VahidooX committed Aug 2, 2022
2 parents 7589f88 + 5c8fe3a commit 0bde720
Showing 25 changed files with 638 additions and 520 deletions.
287 changes: 190 additions & 97 deletions Jenkinsfile

Large diffs are not rendered by default.

63 changes: 22 additions & 41 deletions examples/nlp/language_modeling/conf/megatron_bart_config.yaml
@@ -1,3 +1,7 @@
defaults:
- .@model.encoder: megatron_model_base_config
- .@model.decoder: megatron_model_base_config

name: megatron_bart
restore_from_path: null # used when starting from a .nemo file

@@ -9,7 +13,7 @@ trainer:
logger: False # logger provided by exp_manager
enable_checkpointing: False
replace_sampler_ddp: False
max_epochs: 1000 # PTL default. In practice we don't usually train for more than 1 epoch.
max_epochs: 1000 # PTL default. In practice, max_steps will be reached first.
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 100
@@ -22,7 +26,7 @@ trainer:
exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_bart
name: ${name}
create_wandb_logger: False
wandb_logger_kwargs:
project: null
@@ -35,56 +39,28 @@ exp_manager:
save_top_k: 10
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
filename: 'megatron_bart--{val_loss:.2f}-{step}-{consumed_samples}'
filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}

model:
# model parallelism
global_batch_size: 8
micro_batch_size: 4
global_batch_size: 8 # will use more micro batches to reach global batch size
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
resume_from_checkpoint: null # manually set the checkpoint file to load from
pipeline_model_parallel_split_rank: 0 # rank at which decoder starts.

# model architecture
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler

megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
attention_dropout: 0.1 # Dropout probability in the attention layer.
position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'relative']
relative_attention_num_buckets: 32 # Relative position number of buckets for computing the bias
relative_attention_max_distance: 128 # Maximum relative distance; positions farther apart share the same bucket.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.
share_token_embeddings: True # If True share encoder/decoder embeddings
share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

tokenizer:
library: 'megatron'
@@ -93,23 +69,28 @@ model:
vocab_file: null
merge_file: null
num_sentinel_tokens: 0 # expected to be 0 for BART
sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

# weight init
embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.

# embedding dropout
embedding_dropout: 0.1

# embedding sharing
share_token_embeddings: True # If True share encoder/decoder embeddings
share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level; logs with severity greater than or equal to this are displayed

# not implemented in NeMo yet
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1

data:
# Path to data must be specified by the user.
# can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-bart_00_text_document,.5,/raid/data/pile/my-bart_01_text_document]",
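
The batch-size fields above fit together: the global batch is reached by accumulating micro batches across data-parallel ranks, and the consumed_samples formula in the max_steps comment uses the same quantities. A minimal standalone sketch in Python (the function names are illustrative, not NeMo APIs):

def accumulation_steps(global_batch_size, micro_batch_size, data_parallel_size):
    # Micro batches accumulated per optimizer step to reach the global batch size.
    assert global_batch_size % (micro_batch_size * data_parallel_size) == 0
    return global_batch_size // (micro_batch_size * data_parallel_size)

def consumed_samples(global_step, micro_batch_size, data_parallel_size, accumulate_grad_batches):
    # Mirrors the comment on max_steps in the config above.
    return global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches

# With the defaults above (micro_batch_size=4, global_batch_size=8) on a single GPU:
accum = accumulation_steps(8, 4, 1)            # 2 micro batches per optimizer step
print(consumed_samples(100_000, 4, 1, accum))  # 800000 samples at max_steps
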
@@ -73,6 +73,7 @@ model:
vocab_file: null
merge_file: null
delimiter: null # only used for tabular tokenizer
sentencepiece_legacy: false # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
@@ -0,0 +1,32 @@
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
attention_dropout: 0.1 # Dropout probability in the attention layer.
position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'relative']
relative_attention_num_buckets: 32 # Relative position number of buckets for computing the bias
relative_attention_max_distance: 128 # Maximum relative distance; positions farther apart share the same bucket.
relative_position_bias_self_attention_only: True # Whether to use relative position bias for self-attention only.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
arch: 'transformer' # Options: ['transformer', 'perceiver']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.
openai_gelu: False # Use OpenAI's GELU instead of the default GeLU
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
fp32_residual_connection: False # Use FP32 for residual connections.
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1
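
The 32-line file added above appears to be the megatron_model_base_config referenced by the new defaults entries in megatron_bart_config.yaml and megatron_t5_config.yaml: Hydra's package-override syntax (.@model.encoder and .@model.decoder) places a copy of these keys under model.encoder and model.decoder, so the encoder and decoder can now be configured independently instead of sharing flat encoder_arch/decoder_arch style fields. A hedged sketch of how the composition could be exercised, assuming the snippet runs next to the conf directory shown in the diff; the override values are arbitrary examples, and the version_base argument may need to be dropped on older Hydra releases:

from hydra import compose, initialize

with initialize(config_path="conf", version_base=None):
    cfg = compose(
        config_name="megatron_bart_config",
        overrides=[
            "model.encoder.arch=perceiver",  # per-module override of a base-config key
            "model.decoder.num_layers=6",    # decoder no longer shares this value with the encoder
        ],
    )

print(cfg.model.encoder.arch)        # perceiver
print(cfg.model.decoder.num_layers)  # 6
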
60 changes: 19 additions & 41 deletions examples/nlp/language_modeling/conf/megatron_t5_config.yaml
@@ -1,3 +1,7 @@
defaults:
- .@model.encoder: megatron_model_base_config
- .@model.decoder: megatron_model_base_config

name: megatron_t5
restore_from_path: null # used when starting from a .nemo file

@@ -22,7 +26,7 @@ trainer:
exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_t5
name: ${name}
create_wandb_logger: False
wandb_logger_kwargs:
project: null
@@ -35,7 +39,7 @@ exp_manager:
save_top_k: 10
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
filename: 'megatron_t5--{val_loss:.2f}-{step}-{consumed_samples}'
filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}

model:
@@ -49,45 +53,14 @@ model:

# model architecture
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler

megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
attention_dropout: 0.1 # Dropout probability in the attention layer.
position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'relative']
relative_attention_num_buckets: 32 # Relative position number of buckets for computing the bias
relative_attention_max_distance: 128 # Maximum relative distance; positions farther apart share the same bucket.
relative_position_bias_self_attention_only: True # Whether to use relative position bias for self-attention only.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.
share_token_embeddings: True # If True share encoder/decoder embeddings
share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

tokenizer:
library: 'megatron'
@@ -96,23 +69,28 @@ model:
vocab_file: null
merge_file: null
num_sentinel_tokens: 100
sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

# weight init
embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.

# embedding dropout
embedding_dropout: 0.1

# embedding sharing
share_token_embeddings: True # If True share encoder/decoder embeddings
share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level; logs with severity greater than or equal to this are displayed

# not implemented in NeMo yet
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1

data:
# Path to data must be specified by the user.
# can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]",
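
Both configs also compute model_parallel_size with a ${multiply:...} interpolation. OmegaConf has no built-in multiply resolver, so one has to be registered before the value is resolved; presumably NeMo registers it as part of its training setup, and the registration is not shown in this diff. A minimal standalone sketch of the mechanism, with made-up parallelism values:

from omegaconf import OmegaConf

# Register a resolver so ${multiply:a, b} interpolations can be evaluated.
OmegaConf.register_new_resolver("multiply", lambda x, y: x * y)

cfg = OmegaConf.create(
    {
        "tensor_model_parallel_size": 2,
        "pipeline_model_parallel_size": 2,
        "model_parallel_size": "${multiply:${tensor_model_parallel_size}, ${pipeline_model_parallel_size}}",
    }
)
print(cfg.model_parallel_size)  # 4
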