From 1b5fac4ab0a5067df55283c15890b8ed891ad28c Mon Sep 17 00:00:00 2001
From: Sandeep Subramanian
Date: Mon, 14 Nov 2022 17:29:44 -0800
Subject: [PATCH] Fix args (#5410)

Signed-off-by: MaximumEntropy
Signed-off-by: MaximumEntropy
---
 .../language_modeling/conf/megatron_model_base_config.yaml  | 1 +
 .../nlp/modules/common/megatron/megatron_decoders.py        | 1 +
 .../nlp/modules/common/megatron/megatron_encoders.py        | 2 ++
 .../modules/common/megatron/megatron_transformer_decoder.py | 2 ++
 .../modules/common/megatron/megatron_transformer_encoder.py | 2 ++
 .../modules/common/megatron/token_level_encoder_decoder.py  | 4 +++-
 6 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml
index f68b9ecf87b2..1602cda23731 100644
--- a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml
@@ -31,5 +31,6 @@ onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
 fp32_residual_connection: False # Use FP32 for residual connections.
 activations_checkpoint_method: null # 'uniform', 'block'
 activations_checkpoint_num_layers: 1
+activations_checkpoint_granularity: null
 megatron_legacy: False # Whether to use the legacy Megatron model. This affects the way q,k,v is partitioned from the mixed q,k,v layer in ParallelAttention. This needs to be True for models converted from HF.
 normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True.
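
Sketch only: the new key defaults to null and is read with a None fallback, mirroring the encoder_cfg.get(...) calls later in this patch. The config values shown here are illustrative, and the accepted granularity strings (e.g. 'selective') depend on the NeMo/Megatron version in use.

    # Minimal sketch (assumed usage, not part of this patch): reading the new key
    # from an OmegaConf config with a None default.
    from omegaconf import OmegaConf

    encoder_cfg = OmegaConf.create(
        {
            "activations_checkpoint_method": "uniform",         # 'uniform', 'block', or null
            "activations_checkpoint_num_layers": 1,
            "activations_checkpoint_granularity": "selective",  # illustrative value
        }
    )

    # A missing key falls back to None, matching the default added in the YAML above.
    granularity = encoder_cfg.get("activations_checkpoint_granularity", None)
    print(granularity)  # -> selective
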
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py
index 63d14cfe84d1..901d55ef4511 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py
@@ -119,6 +119,7 @@ def get_decoder_model(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
index 1917979fc66a..6b6a44c036e9 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
@@ -121,6 +121,7 @@ def get_encoder_model(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
@@ -198,6 +199,7 @@ def get_encoder_model(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py
index 5104855c860d..530eeffaf466 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py
@@ -65,6 +65,7 @@ def __init__(
         fp32_residual_connection=False,
         activations_checkpoint_method=None,
         activations_checkpoint_num_layers=1,
+        activations_checkpoint_granularity=None,
         layernorm_epsilon=1e-5,
         bias_activation_fusion=True,
         bias_dropout_add_fusion=True,
@@ -119,6 +120,7 @@ def __init__(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             hidden_dropout=hidden_dropout,
             attention_dropout=attention_dropout,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py
index b48d89cd9644..4b1799680d54 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py
@@ -62,6 +62,7 @@ def __init__(
         fp32_residual_connection=False,
         activations_checkpoint_method=None,
         activations_checkpoint_num_layers=1,
+        activations_checkpoint_granularity=None,
         layernorm_epsilon=1e-5,
         bias_activation_fusion=True,
         bias_dropout_add_fusion=True,
@@ -117,6 +118,7 @@ def __init__(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             hidden_dropout=hidden_dropout,
             attention_dropout=attention_dropout,
diff --git a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py
index 758acaa6644a..78a71492f7e1 100644
--- a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py
@@ -179,6 +179,7 @@ def __init__(
                 fp32_residual_connection=encoder_cfg.get('fp32_residual_connection', False),
                 activations_checkpoint_method=encoder_cfg.get('activations_checkpoint_method', None),
                 activations_checkpoint_num_layers=encoder_cfg.get('activations_checkpoint_num_layers', 1),
+                activations_checkpoint_granularity=encoder_cfg.get('activations_checkpoint_granularity', None),
                 layernorm_epsilon=encoder_cfg.get('layernorm_epsilon', 1e-5),
                 bias_activation_fusion=encoder_cfg.get('bias_activation_fusion', True),
                 bias_dropout_add_fusion=encoder_cfg.get('bias_dropout_add_fusion', True),
@@ -279,11 +280,12 @@ def __init__(
                 use_cpu_initialization=use_cpu_initialization,
                 hidden_dropout=decoder_cfg.get('hidden_dropout', 0.1),
                 attention_dropout=decoder_cfg.get('attention_dropout', 0.1),
-                ffn_dropout=encoder_cfg.get('ffn_dropout', 0.0),
+                ffn_dropout=decoder_cfg.get('ffn_dropout', 0.0),
                 precision=precision,
                 fp32_residual_connection=decoder_cfg.get('fp32_residual_connection', False),
                 activations_checkpoint_method=decoder_cfg.get('activations_checkpoint_method', None),
                 activations_checkpoint_num_layers=decoder_cfg.get('activations_checkpoint_num_layers', 1),
+                activations_checkpoint_granularity=decoder_cfg.get('activations_checkpoint_granularity', None),
                 layernorm_epsilon=decoder_cfg.get('layernorm_epsilon', 1e-5),
                 bias_activation_fusion=decoder_cfg.get('bias_activation_fusion', True),
                 bias_dropout_add_fusion=decoder_cfg.get('bias_dropout_add_fusion', True),
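
The second hunk above also corrects ffn_dropout to read from decoder_cfg rather than encoder_cfg, so the decoder no longer inherits the encoder's FFN dropout. The granularity change itself is plumbing: the option only takes effect if every constructor between the config and the transformer forwards it. A simplified, self-contained sketch of that pattern follows; the functions are stand-ins for illustration, not the actual NeMo classes.

    # Simplified illustration of threading a kwarg through a constructor chain:
    # if any intermediate layer drops it, the setting silently never reaches the
    # module that performs activation checkpointing.

    def parallel_transformer(activations_checkpoint_method=None,
                             activations_checkpoint_num_layers=1,
                             activations_checkpoint_granularity=None):
        # Stand-in for the inner transformer that applies checkpointing.
        return {
            "method": activations_checkpoint_method,
            "num_layers": activations_checkpoint_num_layers,
            "granularity": activations_checkpoint_granularity,
        }

    def get_encoder_model(encoder_cfg):
        # Mirrors the pattern in the patch: read each option from the config
        # with a default, then forward it explicitly to the inner module.
        return parallel_transformer(
            activations_checkpoint_method=encoder_cfg.get("activations_checkpoint_method", None),
            activations_checkpoint_num_layers=encoder_cfg.get("activations_checkpoint_num_layers", 1),
            activations_checkpoint_granularity=encoder_cfg.get("activations_checkpoint_granularity", None),
        )

    if __name__ == "__main__":
        cfg = {"activations_checkpoint_granularity": "selective"}  # illustrative value
        print(get_encoder_model(cfg))  # granularity now reaches the inner module
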