Merge branch 'main' of https://github.com/NVIDIA/NeMo into casual_conformer_lookahead_newdesign
VahidooX committed Aug 2, 2022
2 parents 7589f88 + 5c8fe3a commit 0bde720
Showing 25 changed files with 638 additions and 520 deletions.
287 changes: 190 additions & 97 deletions Jenkinsfile

Large diffs are not rendered by default.

63 changes: 22 additions & 41 deletions examples/nlp/language_modeling/conf/megatron_bart_config.yaml
@@ -1,3 +1,7 @@
defaults:
- .@model.encoder: megatron_model_base_config
- .@model.decoder: megatron_model_base_config

name: megatron_bart
restore_from_path: null # used when starting from a .nemo file

@@ -9,7 +13,7 @@ trainer:
logger: False # logger provided by exp_manager
enable_checkpointing: False
replace_sampler_ddp: False
max_epochs: 1000 # PTL default. In practice we don't usually train for more than 1 epoch.
max_epochs: 1000 # PTL default. In practice, max_steps will be reached first.
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 100
@@ -22,7 +26,7 @@ trainer:
exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_bart
name: ${name}
create_wandb_logger: False
wandb_logger_kwargs:
project: null
@@ -35,56 +39,28 @@ exp_manager:
save_top_k: 10
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
filename: 'megatron_bart--{val_loss:.2f}-{step}-{consumed_samples}'
filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}

model:
# model parallelism
global_batch_size: 8
micro_batch_size: 4
global_batch_size: 8 # will use more micro batches to reach global batch size
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
resume_from_checkpoint: null # manually set the checkpoint file to load from
pipeline_model_parallel_split_rank: 0 # rank at which decoder starts.

# model architecture
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler

megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
attention_dropout: 0.1 # Dropout probability in the attention layer.
position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'relative']
relative_attention_num_buckets: 32 # Relative position number of buckets for computing the bias
relative_attention_max_distance: 128 # Maximum relative distance; positions farther apart share the same bucket.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.
share_token_embeddings: True # If True share encoder/decoder embeddings
share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

tokenizer:
library: 'megatron'
@@ -93,23 +69,28 @@ model:
vocab_file: null
merge_file: null
num_sentinel_tokens: 0 # expected to be 0 for BART
sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

# weight init
embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.

# embedding dropout
embedding_dropout: 0.1

# embedding sharing
share_token_embeddings: True # If True share encoder/decoder embeddings
share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level; logs with severity greater than or equal to this are displayed

# not implemented in NeMo yet
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1

data:
# Path to data must be specified by the user.
# can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-bart_00_text_document,.5,/raid/data/pile/my-bart_01_text_document]",
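
The batch-size fields above fit together: the global batch is reached by accumulating micro batches across data-parallel ranks, and the consumed_samples formula in the max_steps comment uses the same quantities. A minimal standalone sketch in Python (the function names are illustrative, not NeMo APIs):

def accumulation_steps(global_batch_size, micro_batch_size, data_parallel_size):
    # Micro batches accumulated per optimizer step to reach the global batch size.
    assert global_batch_size % (micro_batch_size * data_parallel_size) == 0
    return global_batch_size // (micro_batch_size * data_parallel_size)

def consumed_samples(global_step, micro_batch_size, data_parallel_size, accumulate_grad_batches):
    # Mirrors the comment on max_steps in the config above.
    return global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches

# With the defaults above (micro_batch_size=4, global_batch_size=8) on a single GPU:
accum = accumulation_steps(8, 4, 1)            # 2 micro batches per optimizer step
print(consumed_samples(100_000, 4, 1, accum))  # 800000 samples at max_steps
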
@@ -73,6 +73,7 @@ model:
vocab_file: null
merge_file: null
delimiter: null # only used for tabular tokenizer
sentencepiece_legacy: false # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
@@ -0,0 +1,32 @@
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
attention_dropout: 0.1 # Dropout probability in the attention layer.
position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'relative']
relative_attention_num_buckets: 32 # Relative position number of buckets for computing the bias
relative_attention_max_distance: 128 # Maximum relative distance; positions farther apart share the same bucket.
relative_position_bias_self_attention_only: True # Whether to use relative position bias for self-attention only.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
arch: 'transformer' # Options: ['transformer', 'perceiver']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.
openai_gelu: False # Use OpenAI's GELU instead of the default GeLU
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
fp32_residual_connection: False # Use FP32 for residual connections.
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1
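
The 32-line file added above appears to be the megatron_model_base_config referenced by the new defaults entries in megatron_bart_config.yaml and megatron_t5_config.yaml: Hydra's package-override syntax (.@model.encoder and .@model.decoder) places a copy of these keys under model.encoder and model.decoder, so the encoder and decoder can now be configured independently instead of sharing flat encoder_arch/decoder_arch style fields. A hedged sketch of how the composition could be exercised, assuming the snippet runs next to the conf directory shown in the diff; the override values are arbitrary examples, and the version_base argument may need to be dropped on older Hydra releases:

from hydra import compose, initialize

with initialize(config_path="conf", version_base=None):
    cfg = compose(
        config_name="megatron_bart_config",
        overrides=[
            "model.encoder.arch=perceiver",  # per-module override of a base-config key
            "model.decoder.num_layers=6",    # decoder no longer shares this value with the encoder
        ],
    )

print(cfg.model.encoder.arch)        # perceiver
print(cfg.model.decoder.num_layers)  # 6
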
60 changes: 19 additions & 41 deletions examples/nlp/language_modeling/conf/megatron_t5_config.yaml
@@ -1,3 +1,7 @@
defaults:
- .@model.encoder: megatron_model_base_config
- .@model.decoder: megatron_model_base_config

name: megatron_t5
restore_from_path: null # used when starting from a .nemo file

@@ -22,7 +26,7 @@ trainer:
exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_t5
name: ${name}
create_wandb_logger: False
wandb_logger_kwargs:
project: null
@@ -35,7 +39,7 @@ exp_manager:
save_top_k: 10
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
filename: 'megatron_t5--{val_loss:.2f}-{step}-{consumed_samples}'
filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}

model:
@@ -49,45 +53,14 @@ model:

# model architecture
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler

megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
attention_dropout: 0.1 # Dropout probability in the attention layer.
position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'relative']
relative_attention_num_buckets: 32 # Relative position number of buckets for computing the bias
relative_attention_max_distance: 128 # Maximum relative distance; positions farther apart share the same bucket.
relative_position_bias_self_attention_only: True # Whether to use relative position bias for self-attention only.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.
share_token_embeddings: True # If True share encoder/decoder embeddings
share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

tokenizer:
library: 'megatron'
@@ -96,23 +69,28 @@ model:
vocab_file: null
merge_file: null
num_sentinel_tokens: 100
sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

# weight init
embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.

# embedding dropout
embedding_dropout: 0.1

# embedding sharing
share_token_embeddings: True # If True share encoder/decoder embeddings
share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
fp32_residual_connection: False # Move residual connections to fp32
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
apex_transformer_log_level: 30 # Python logging level; logs with severity greater than or equal to this are displayed

# not implemented in NeMo yet
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1

data:
# Path to data must be specified by the user.
# can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]",
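
Both configs also compute model_parallel_size with a ${multiply:...} interpolation. OmegaConf has no built-in multiply resolver, so one has to be registered before the value is resolved; presumably NeMo registers it as part of its training setup, and the registration is not shown in this diff. A minimal standalone sketch of the mechanism, with made-up parallelism values:

from omegaconf import OmegaConf

# Register a resolver so ${multiply:a, b} interpolations can be evaluated.
OmegaConf.register_new_resolver("multiply", lambda x, y: x * y)

cfg = OmegaConf.create(
    {
        "tensor_model_parallel_size": 2,
        "pipeline_model_parallel_size": 2,
        "model_parallel_size": "${multiply:${tensor_model_parallel_size}, ${pipeline_model_parallel_size}}",
    }
)
print(cfg.model_parallel_size)  # 4
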