Merge branch 'main' into cmudict_changes
jasro23 authored Jul 6, 2022
2 parents 049c2fe + ab6c46b commit dc5542c
Showing 16 changed files with 410 additions and 50 deletions.
8 changes: 5 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_bart_config.yaml
@@ -56,7 +56,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -76,11 +76,13 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer'
decoder_arch: 'transformer'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

tokenizer:
library: 'megatron'
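
A note on how the new perceiver options above interact: with encoder_arch: 'perceiver', num_layers counts cross-attention blocks, each block carries num_self_attention_per_cross_attention self-attention layers, and the encoder always emits hidden_steps latent vectors regardless of the source length. A rough, illustrative calculation with the default values from this config (not part of the diff itself):

num_layers = 12                             # cross-attention blocks in the perceiver encoder
num_self_attention_per_cross_attention = 1  # self-attention layers inside each block
hidden_steps = 32                           # latent vectors emitted by the encoder

cross_attention_layers = num_layers
self_attention_layers = num_layers * num_self_attention_per_cross_attention
print(cross_attention_layers, self_attention_layers)  # 12 12
# The decoder later cross-attends over these 32 latents per example,
# independent of the 512-token source sequence length.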
@@ -104,7 +104,7 @@ model:
sched:
name: CosineAnnealing
warmup_steps: 50
min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
constant_steps: 0 # Constant steps should also be 0 when min_lr=0
min_lr: 0.0 # min_lr must be 0.0 for prompt learning
monitor: val_loss
reduce_on_plateau: false
8 changes: 5 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_t5_config.yaml
@@ -57,7 +57,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -78,11 +78,13 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer'
decoder_arch: 'transformer'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

tokenizer:
library: 'megatron'
8 changes: 5 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_ul2_config.yaml
@@ -55,7 +55,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -75,11 +75,13 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer'
decoder_arch: 'transformer'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

tokenizer:
library: 'megatron'
@@ -66,7 +66,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -91,6 +91,8 @@ model:
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
@@ -20,6 +20,9 @@


class BARTDataset(T5Dataset):
# account for added tokens
MAX_SEQ_LENGTH_DELTA = 1

def __init__(
self,
cfg,
@@ -77,8 +80,8 @@ def pad_and_convert_to_numpy(
self, tokens, output_tokens, masked_positions, masked_labels, masked_spans=None, np_rng=None,
):
"""Pad sequences and convert them to numpy."""
bart_decoder_in = [self.bos_id] + tokens[:-1]
bart_decoder_out = tokens
bart_decoder_in = [self.bos_id] + tokens
bart_decoder_out = tokens + [self.eos_id]

if masked_spans is not None:
# construct bart input by collapsing multiple <mask> into one, and delete randomly
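
The change above makes the decoder sequences one token longer: the decoder input is now BOS plus the full target (rather than the target with its last token dropped) and the decoder output is the full target plus EOS, so the model is explicitly trained to emit EOS. A minimal sketch of the resulting teacher-forcing pair, with made-up token ids:

# Hypothetical ids for illustration; bos_id/eos_id stand in for self.bos_id/self.eos_id.
bos_id, eos_id = 1, 2
tokens = [37, 98, 14]                 # target tokens for one example

bart_decoder_in = [bos_id] + tokens   # [1, 37, 98, 14]
bart_decoder_out = tokens + [eos_id]  # [37, 98, 14, 2]

# Position i of the decoder input predicts position i of the decoder output.
# Compared with the previous `[bos_id] + tokens[:-1]` / `tokens` pair, the sequence is
# one position longer and the model now learns to produce EOS at the end.
assert len(bart_decoder_in) == len(bart_decoder_out)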
@@ -29,6 +29,9 @@


class T5Dataset(Dataset):
# account for added tokens
MAX_SEQ_LENGTH_DELTA = 2

def __init__(
self,
cfg,
@@ -86,7 +89,7 @@ def __init__(
data_prefix=data_prefix,
num_epochs=num_epochs,
max_num_samples=max_num_samples,
max_seq_length=self.max_seq_length - 2, # account for added tokens
max_seq_length=self.max_seq_length - self.MAX_SEQ_LENGTH_DELTA, # account for added tokens
short_seq_prob=self.short_seq_prob,
seed=self.seed,
name=self.name,
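
The hard-coded `- 2` becomes a class attribute so subclasses can reserve a different number of positions for the special tokens they add later: T5Dataset keeps a delta of 2, while BARTDataset (above) overrides it with 1. A small sketch of the override pattern, with the real constructor arguments elided:

# Sketch only: the real dataset classes take many more constructor arguments.
class T5DatasetSketch:
    MAX_SEQ_LENGTH_DELTA = 2  # positions reserved for added special tokens

    def budget(self, max_seq_length: int) -> int:
        return max_seq_length - self.MAX_SEQ_LENGTH_DELTA

class BARTDatasetSketch(T5DatasetSketch):
    MAX_SEQ_LENGTH_DELTA = 1  # BART reserves only one extra position

print(T5DatasetSketch().budget(512))    # 510
print(BARTDatasetSketch().budget(512))  # 511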
@@ -104,21 +104,20 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
override_config_path=frozen_model_cfg,
)

if self.frozen_model.cfg.precision == 16:
self.float_type = torch.float16
elif self.frozen_model.cfg.precision == 'bf16':
self.float_type = torch.bfloat16
else:
self.float_type = torch.float

# TODO: Enable amp_o2 training
self.megatron_amp_o2 = False
self.pipeline_parallel = self.cfg.get('pipeline_model_parallel_size', 1) > 1
self.tokenizer = self.frozen_model.tokenizer
self.hidden_size = self.frozen_model.cfg.hidden_size
self.existing_tasks = list(self.cfg.get('existing_tasks', []))
self.new_tasks = list(self.cfg.get('new_tasks', []))
self.virtual_prompt_style = VirtualPromptStyle(cfg.virtual_prompt_style)

if self.pipeline_parallel:
assert (
self.cfg.optim.sched.get("min_lr", 0.0) == 0.0
), "Minimum lr must be 0.0 when pipeline parallel size is > 1"

# Load templates for assigning virtual prompt token positions
self.load_task_templates(self.cfg.task_templates)
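
The removed float_type bookkeeping above is superseded by self.autocast_dtype, which is assumed to be set by the shared base model; forward() below then wraps the frozen model call in torch.autocast only when that dtype is not full precision. A hedged sketch of the mapping and call pattern (the helper names here are illustrative, not NeMo APIs):

import torch

def autocast_dtype_from_precision(precision):
    # Mirrors the removed mapping: 16 -> fp16, 'bf16' -> bfloat16, otherwise fp32.
    if precision == 16:
        return torch.float16
    if precision == 'bf16':
        return torch.bfloat16
    return torch.float32

def call_frozen_model(model, autocast_dtype, **kwargs):
    # No autocast context for fp32; otherwise run the frozen model under CUDA autocast.
    if autocast_dtype == torch.float32:
        return model(**kwargs)
    with torch.autocast(device_type="cuda", dtype=autocast_dtype):
        return model(**kwargs)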

@@ -348,16 +347,33 @@ def setup_optimizer_param_groups(self):
to be passed around in pipeline parallel models. The prompt-encoder
and/or prompt table will use the learning rate set by the user.
"""
virtual_prompt_params = {'params': []}
frozen_model_params = {'params': [param for param in self.frozen_model.parameters()], 'lr': 0.0}
# Freeze frozen model
for param in self.frozen_model.parameters():
param.requires_grad = False

if self.frozen_model.model.pre_process:
virtual_prompt_params['params'].extend([param for param in self.prompt_table.parameters()])
# Need to handle frozen model freezing differently when pp > 1
if self.pipeline_parallel:
virtual_prompt_params = {'params': []}
frozen_model_params = {'params': [], 'lr': 0.0}

if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
virtual_prompt_params['params'].extend([param for param in self.prompt_encoder.parameters()])
if self.frozen_model.model.pre_process:
virtual_prompt_params['params'].extend([param for param in self.prompt_table.parameters()])

if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
virtual_prompt_params['params'].extend([param for param in self.prompt_encoder.parameters()])

self._optimizer_param_groups = virtual_prompt_params, frozen_model_params
# Unfreeze one part of each transformer layer, setting lr to 0.0 so DDP
# and AMP won't complain but the model still remains frozen
for layer in self.frozen_model.model.language_model.encoder.layers:
for param in layer.input_layernorm.parameters():
param.requires_grad = True

frozen_model_params['params'].extend([param for param in self.frozen_model.parameters()])

self._optimizer_param_groups = virtual_prompt_params, frozen_model_params

else:
super().setup_optimizer_param_groups()
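
Summary of the pp > 1 branch above: every frozen-model weight has requires_grad switched off, then one small parameter set per transformer layer (the input layernorm) is switched back on inside a zero-learning-rate group so that DDP and AMP see a trainable tensor on every pipeline stage while the model stays effectively frozen; training_step later re-pins that group's lr to 0.0 after each update. A condensed, hypothetical sketch of the idea (attribute paths simplified, not the exact NeMo code):

import torch

def build_prompt_learning_param_groups(frozen_model: torch.nn.Module, prompt_params, layers):
    # Freeze every frozen-model weight ...
    for param in frozen_model.parameters():
        param.requires_grad = False

    # ... then re-enable a tiny parameter subset per layer so DDP/AMP have
    # something trainable on each pipeline stage.
    for layer in layers:
        for param in layer.input_layernorm.parameters():
            param.requires_grad = True

    virtual_prompt_group = {'params': list(prompt_params)}                 # trains at the user lr
    frozen_group = {'params': list(frozen_model.parameters()), 'lr': 0.0}  # pinned to lr 0.0
    return virtual_prompt_group, frozen_group

# After every optimizer step, mirroring training_step below:
# optimizer.param_groups[1]['lr'] = 0.0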

def forward(
self,
@@ -388,7 +404,7 @@ def forward(
encoder_input = None

# Call forward on GPT model with preprocessed embeddings
if self.float_type == torch.float32:
if self.autocast_dtype == torch.float32:
output = self.frozen_model.model(
input_ids=None,
position_ids=None,
@@ -399,7 +415,7 @@
inference_max_sequence_len=inference_max_sequence_len,
)
else:
with torch.autocast(device_type="cuda", dtype=self.float_type):
with torch.autocast(device_type="cuda", dtype=self.autocast_dtype):
output = self.frozen_model.model(
input_ids=None,
position_ids=None,
@@ -524,7 +540,7 @@ def fwd_bwd_step(self, batch, batch_idx, forward_only):
_, seq_length = batch[0].shape
tensor_shape = [seq_length, self.cfg.micro_batch_size, self.hidden_size]

if self.cfg.get('pipeline_model_parallel_size', 1) > 1:
if self.pipeline_parallel:
losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving(
forward_step_func=self.get_forward_output_and_loss_func(),
batch=batch,
@@ -580,7 +596,8 @@ def training_step(self, batch, batch_idx):

# Need to make sure the frozen model param learning rate stays 0.0
# so forcing lr to be 0.0 for gpt layers before param update
self._optimizer.param_groups[1]['lr'] = 0.0
if self.pipeline_parallel:
self._optimizer.param_groups[1]['lr'] = 0.0

return loss_mean

@@ -712,24 +729,24 @@ def build_virtual_prompt_dataset(
task_templates=self.task_templates,
pseudo_tokens=self.pseudo_tokens,
pad_token_id=self.pad_token_id,
max_seq_length=self.cfg.data.get('max_seq_length', self.frozen_model.cfg.max_position_embeddings),
max_seq_length=self.frozen_model.cfg.encoder_seq_length,
min_seq_length=self.cfg.data.get('min_seq_length', 1),
add_bos=self.cfg.data.get('add_bos', False),
add_eos=self.cfg.data.get('add_eos', True),
for_train=for_train,
)

rank = parallel_state.get_data_parallel_rank()
world_size = parallel_state.get_data_parallel_world_size()
data_parallel_size = parallel_state.get_data_parallel_world_size()
sampler = torch.utils.data.distributed.DistributedSampler(
dataset, num_replicas=world_size, rank=rank, shuffle=shuffle
dataset, num_replicas=data_parallel_size, rank=rank, shuffle=shuffle
)

dataloader = torch.utils.data.DataLoader(
dataset,
collate_fn=dataset.collate_fn,
sampler=sampler,
batch_size=batch_size,
batch_size=batch_size // data_parallel_size,
drop_last=drop_last,
num_workers=num_workers,
pin_memory=pin_memory,
@@ -771,7 +788,7 @@ def dummy():
task_templates=self.task_templates,
pseudo_tokens=self.pseudo_tokens,
pad_token_id=self.pad_token_id,
max_seq_length=self.cfg.data.get('max_seq_length', self.frozen_model.cfg.max_position_embeddings),
max_seq_length=self.frozen_model.cfg.encoder_seq_length,
min_seq_length=self.cfg.data.get('min_seq_length', 1),
add_bos=sampling_params["add_BOS"],
add_eos=False,
@@ -820,7 +837,7 @@ def set_input_tensor(self, input_tensor):
model's forward_step_func won't have it. This function is thus
used by internal code to bypass the input provided by the
forward_step_func"""
# self.input_tensor = input_tensor

self.frozen_model.model.set_input_tensor(input_tensor)

def get_forward_output_and_loss_func(self):
@@ -120,6 +120,8 @@ def setup_optimizer_param_groups(self):

def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
# TODO: create get_encoder_decoder_model() here for different losses (e.g., nll, vae, mim)
if parallel_state.get_pipeline_model_parallel_world_size() > 1 and self.cfg.encoder_arch == 'perceiver':
raise ValueError("Perceivers with pipeline parallel > 1 are not supported yet.")
if hasattr(self.cfg, 'bias_gelu_fusion'):
logging.warning('bias_gelu_fusion is deprecated. Please use bias_activation_fusion instead.')
activation_fusion = self.cfg.bias_gelu_fusion
@@ -163,6 +165,8 @@ def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
normalization=self.cfg.get('normalization', 'layernorm'),
transformer_block_type=self.cfg.get('transformer_block_type', 'pre_ln'),
headscale=self.cfg.get('headscale', False),
hidden_steps=self.cfg.get('hidden_steps', -1),
num_self_attention_per_cross_attention=self.cfg.get('num_self_attention_per_cross_attention', 1),
add_encoder=add_encoder,
add_decoder=add_decoder,
)
@@ -76,7 +76,6 @@ def get_decoder_model(
headscale=False,
transformer_block_type="pre_ln",
hidden_steps=-1,
hidden_blocks=1,
parent_model_type=ModelType.encoder_or_decoder,
layer_type=None,
chunk_size=64,
@@ -13,7 +13,9 @@
# limitations under the License.

"""Transformer based language model."""
import torch

from nemo.collections.nlp.modules.common.megatron.megatron_perceiver_encoders import MegatronPerceiverEncoderModule
from nemo.collections.nlp.modules.common.megatron.module import MegatronModule
from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults

@@ -41,15 +43,25 @@ def __init__(
# AttnMaskType enum mask type (e.g., padding, causal)
encoder_attn_mask_type: AttnMaskType = None,
decoder_attn_mask_type: AttnMaskType = None,
hidden_steps: int = None,
):
super(MegatronTransformerEncoderDecoderModule, self).__init__()

self.encoder = encoder
self.decoder = decoder
self.hidden_steps = hidden_steps
if isinstance(encoder, MegatronPerceiverEncoderModule) and hidden_steps is None:
raise ValueError(
f"hidden_steps cannot be None for perceiver encoders. It is needed to compute the encoder-decoder cross attention mask."
)

# try to infer mask_type if not given
if encoder_attn_mask_type is None:
if encoder is None:
encoder_attn_mask_type = None
# Perceiver does not have a `.model` attribute, assume it always uses padding mask.
elif isinstance(encoder, MegatronPerceiverEncoderModule):
encoder_attn_mask_type = AttnMaskType.padding
elif hasattr(encoder.model, 'self_attn_mask_type'):
encoder_attn_mask_type = encoder.model.self_attn_mask_type
else:
@@ -136,6 +148,10 @@ def forward(
return enc_output

# decoder
# Adjust encoder attention mask if encoder is a perceiver.
if self.encoder is not None and isinstance(self.encoder, MegatronPerceiverEncoderModule):
enc_attn_mask = torch.ones(enc_output.size(0), self.hidden_steps).to(enc_output.device)

dec_output = self.decode(
dec_input=dec_input,
dec_attn_mask=dec_attn_mask,
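
Because a perceiver encoder compresses the source into a fixed set of hidden_steps latent vectors, the original padding mask (sized to the source length) no longer matches the encoder output, so the cross-attention mask handed to the decoder is rebuilt as an all-ones mask over those latents, as in the torch.ones(...) call above. A small sketch of that substitution (device placement added; the dimension layout follows whatever convention enc_output uses in NeMo):

import torch

def perceiver_cross_attention_mask(enc_output: torch.Tensor, hidden_steps: int) -> torch.Tensor:
    # Every latent position is valid, so the decoder may attend to all
    # `hidden_steps` latents for each example.
    return torch.ones(enc_output.size(0), hidden_steps, device=enc_output.device)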