Fix args (NVIDIA#5410)
Signed-off-by: MaximumEntropy <sandeep.subramanian.1@umontreal.ca>

MaximumEntropy authored Nov 15, 2022
Parent: dbe41af · Commit: 1b5fac4
Showing 6 changed files with 11 additions and 1 deletion.
File 1 of 6:
@@ -31,5 +31,6 @@ onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
fp32_residual_connection: False # Use FP32 for residual connections.
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1
+activations_checkpoint_granularity: null
megatron_legacy: False # Whether to use the legacy Megatron model. This affects the way q,k,v is partitioned from the mixed q,k,v layer in ParallelAttention. This needs to be True for models converted from HF.
normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
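The new activations_checkpoint_granularity key defaults to null, so existing configs keep their old behaviour. As a rough illustration of how such a key is consumed, here is a minimal sketch using OmegaConf (which NeMo configs are based on); the loading harness below is illustrative, not code from this commit:

from omegaconf import OmegaConf

# Keys mirror the hunk above; values here are just for demonstration.
cfg = OmegaConf.create({
    "activations_checkpoint_method": None,       # 'uniform', 'block'
    "activations_checkpoint_num_layers": 1,
    "activations_checkpoint_granularity": None,  # left unset by default, as in the YAML above
})

# Older configs that lack the key still work, because every consumer reads it
# with a default, exactly as the Python hunks below do:
granularity = cfg.get("activations_checkpoint_granularity", None)
print(granularity)  # -> None unless a config overrides it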
File 2 of 6:
@@ -119,6 +119,7 @@ def get_decoder_model(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
bias_activation_fusion=bias_activation_fusion,
bias_dropout_add_fusion=bias_dropout_add_fusion,
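This hunk and the remaining activations_checkpoint_granularity hunks below all apply the same pattern: thread the new argument from the config through the encoder/decoder builder functions and module constructors, so that a value set in YAML actually reaches the transformer that implements checkpointing. A compressed, hypothetical sketch of that plumbing follows; the class and function names are illustrative stand-ins, not the NeMo symbols:

class TinyTransformer:
    """Stand-in for the inner transformer that consumes the setting."""
    def __init__(self, activations_checkpoint_method=None,
                 activations_checkpoint_num_layers=1,
                 activations_checkpoint_granularity=None):
        # Before this commit, call sites omitted the kwarg, so the inner
        # module always saw the default (None) regardless of the config.
        self.granularity = activations_checkpoint_granularity


def build_decoder(cfg: dict) -> TinyTransformer:
    return TinyTransformer(
        activations_checkpoint_method=cfg.get("activations_checkpoint_method", None),
        activations_checkpoint_num_layers=cfg.get("activations_checkpoint_num_layers", 1),
        # The kind of line this commit adds at each call site:
        activations_checkpoint_granularity=cfg.get("activations_checkpoint_granularity", None),
    )


decoder = build_decoder({"activations_checkpoint_granularity": "selective"})
assert decoder.granularity == "selective"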
File 3 of 6:
@@ -121,6 +121,7 @@ def get_encoder_model(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
bias_activation_fusion=bias_activation_fusion,
bias_dropout_add_fusion=bias_dropout_add_fusion,
@@ -198,6 +199,7 @@ def get_encoder_model(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
bias_activation_fusion=bias_activation_fusion,
bias_dropout_add_fusion=bias_dropout_add_fusion,
File 4 of 6:
@@ -65,6 +65,7 @@ def __init__(
fp32_residual_connection=False,
activations_checkpoint_method=None,
activations_checkpoint_num_layers=1,
+activations_checkpoint_granularity=None,
layernorm_epsilon=1e-5,
bias_activation_fusion=True,
bias_dropout_add_fusion=True,
@@ -119,6 +120,7 @@ def __init__(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
hidden_dropout=hidden_dropout,
attention_dropout=attention_dropout,
File 5 of 6:
@@ -62,6 +62,7 @@ def __init__(
fp32_residual_connection=False,
activations_checkpoint_method=None,
activations_checkpoint_num_layers=1,
+activations_checkpoint_granularity=None,
layernorm_epsilon=1e-5,
bias_activation_fusion=True,
bias_dropout_add_fusion=True,
@@ -117,6 +118,7 @@ def __init__(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
hidden_dropout=hidden_dropout,
attention_dropout=attention_dropout,
File 6 of 6:
@@ -179,6 +179,7 @@ def __init__(
fp32_residual_connection=encoder_cfg.get('fp32_residual_connection', False),
activations_checkpoint_method=encoder_cfg.get('activations_checkpoint_method', None),
activations_checkpoint_num_layers=encoder_cfg.get('activations_checkpoint_num_layers', 1),
+activations_checkpoint_granularity=encoder_cfg.get('activations_checkpoint_granularity', None),
layernorm_epsilon=encoder_cfg.get('layernorm_epsilon', 1e-5),
bias_activation_fusion=encoder_cfg.get('bias_activation_fusion', True),
bias_dropout_add_fusion=encoder_cfg.get('bias_dropout_add_fusion', True),
@@ -279,11 +280,12 @@ def __init__(
use_cpu_initialization=use_cpu_initialization,
hidden_dropout=decoder_cfg.get('hidden_dropout', 0.1),
attention_dropout=decoder_cfg.get('attention_dropout', 0.1),
-ffn_dropout=encoder_cfg.get('ffn_dropout', 0.0),
+ffn_dropout=decoder_cfg.get('ffn_dropout', 0.0),
precision=precision,
fp32_residual_connection=decoder_cfg.get('fp32_residual_connection', False),
activations_checkpoint_method=decoder_cfg.get('activations_checkpoint_method', None),
activations_checkpoint_num_layers=decoder_cfg.get('activations_checkpoint_num_layers', 1),
+activations_checkpoint_granularity=decoder_cfg.get('activations_checkpoint_granularity', None),
layernorm_epsilon=decoder_cfg.get('layernorm_epsilon', 1e-5),
bias_activation_fusion=decoder_cfg.get('bias_activation_fusion', True),
bias_dropout_add_fusion=decoder_cfg.get('bias_dropout_add_fusion', True),
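The decoder hunk above also carries the one-line behavioural fix the commit title refers to: ffn_dropout for the decoder was previously read from encoder_cfg, so a decoder-specific value was silently ignored. A minimal before/after sketch, with plain dicts standing in for the config objects (illustrative only, not NeMo code):

encoder_cfg = {"ffn_dropout": 0.0}
decoder_cfg = {"ffn_dropout": 0.1}   # user asks for dropout in the decoder FFN

# Before the fix: the decoder build consulted the encoder's config.
ffn_dropout_before = encoder_cfg.get("ffn_dropout", 0.0)   # -> 0.0, setting ignored

# After the fix: the decoder's own config is used.
ffn_dropout_after = decoder_cfg.get("ffn_dropout", 0.0)    # -> 0.1

assert (ffn_dropout_before, ffn_dropout_after) == (0.0, 0.1)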
