Merge branch 'main' into cmudict_changes
jasro23 authored Jul 6, 2022
2 parents 049c2fe + ab6c46b commit dc5542c
Showing 16 changed files with 410 additions and 50 deletions.
8 changes: 5 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_bart_config.yaml
@@ -56,7 +56,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -76,11 +76,13 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer'
decoder_arch: 'transformer'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

tokenizer:
library: 'megatron'
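
A note on how the new perceiver options above interact: with encoder_arch: 'perceiver', num_layers counts cross-attention blocks, each block carries num_self_attention_per_cross_attention self-attention layers, and the encoder always emits hidden_steps latent vectors regardless of the source length. A rough, illustrative calculation with the default values from this config (not part of the diff itself):

num_layers = 12                             # cross-attention blocks in the perceiver encoder
num_self_attention_per_cross_attention = 1  # self-attention layers inside each block
hidden_steps = 32                           # latent vectors emitted by the encoder

cross_attention_layers = num_layers
self_attention_layers = num_layers * num_self_attention_per_cross_attention
print(cross_attention_layers, self_attention_layers)  # 12 12
# The decoder later cross-attends over these 32 latents per example,
# independent of the 512-token source sequence length.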
@@ -104,7 +104,7 @@ model:
sched:
name: CosineAnnealing
warmup_steps: 50
min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
constant_steps: 0 # Constant steps should also be 0 when min_lr=0
min_lr: 0.0 # min_lr must be 0.0 for prompt learning
monitor: val_loss
reduce_on_plateau: false
8 changes: 5 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_t5_config.yaml
@@ -57,7 +57,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -78,11 +78,13 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer'
decoder_arch: 'transformer'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

tokenizer:
library: 'megatron'
8 changes: 5 additions & 3 deletions examples/nlp/language_modeling/conf/megatron_ul2_config.yaml
@@ -55,7 +55,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -75,11 +75,13 @@ model:
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
encoder_arch: 'transformer'
decoder_arch: 'transformer'
encoder_arch: 'transformer' # Options: ['transformer', 'perceiver']
decoder_arch: 'transformer' # Options: ['transformer']
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

tokenizer:
library: 'megatron'
@@ -66,7 +66,7 @@ model:

seq_length: 512
max_position_embeddings: ${.seq_length}
num_layers: 12
num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
@@ -91,6 +91,8 @@ model:
activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu']
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
hidden_steps: 32 # Number of latent vectors to use for perceiver encoders
num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer.

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
@@ -20,6 +20,9 @@


class BARTDataset(T5Dataset):
# account for added tokens
MAX_SEQ_LENGTH_DELTA = 1

def __init__(
self,
cfg,
@@ -77,8 +80,8 @@ def pad_and_convert_to_numpy(
self, tokens, output_tokens, masked_positions, masked_labels, masked_spans=None, np_rng=None,
):
"""Pad sequences and convert them to numpy."""
bart_decoder_in = [self.bos_id] + tokens[:-1]
bart_decoder_out = tokens
bart_decoder_in = [self.bos_id] + tokens
bart_decoder_out = tokens + [self.eos_id]

if masked_spans is not None:
# construct bart input by collapsing multiple <mask> into one, and delete randomly
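
The change above makes the decoder sequences one token longer: the decoder input is now BOS plus the full target (rather than the target with its last token dropped) and the decoder output is the full target plus EOS, so the model is explicitly trained to emit EOS. A minimal sketch of the resulting teacher-forcing pair, with made-up token ids:

# Hypothetical ids for illustration; bos_id/eos_id stand in for self.bos_id/self.eos_id.
bos_id, eos_id = 1, 2
tokens = [37, 98, 14]                 # target tokens for one example

bart_decoder_in = [bos_id] + tokens   # [1, 37, 98, 14]
bart_decoder_out = tokens + [eos_id]  # [37, 98, 14, 2]

# Position i of the decoder input predicts position i of the decoder output.
# Compared with the previous `[bos_id] + tokens[:-1]` / `tokens` pair, the sequence is
# one position longer and the model now learns to produce EOS at the end.
assert len(bart_decoder_in) == len(bart_decoder_out)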
@@ -29,6 +29,9 @@


class T5Dataset(Dataset):
# account for added tokens
MAX_SEQ_LENGTH_DELTA = 2

def __init__(
self,
cfg,
@@ -86,7 +89,7 @@ def __init__(
data_prefix=data_prefix,
num_epochs=num_epochs,
max_num_samples=max_num_samples,
max_seq_length=self.max_seq_length - 2, # account for added tokens
max_seq_length=self.max_seq_length - self.MAX_SEQ_LENGTH_DELTA, # account for added tokens
short_seq_prob=self.short_seq_prob,
seed=self.seed,
name=self.name,
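
The hard-coded `- 2` becomes a class attribute so subclasses can reserve a different number of positions for the special tokens they add later: T5Dataset keeps a delta of 2, while BARTDataset (above) overrides it with 1. A small sketch of the override pattern, with the real constructor arguments elided:

# Sketch only: the real dataset classes take many more constructor arguments.
class T5DatasetSketch:
    MAX_SEQ_LENGTH_DELTA = 2  # positions reserved for added special tokens

    def budget(self, max_seq_length: int) -> int:
        return max_seq_length - self.MAX_SEQ_LENGTH_DELTA

class BARTDatasetSketch(T5DatasetSketch):
    MAX_SEQ_LENGTH_DELTA = 1  # BART reserves only one extra position

print(T5DatasetSketch().budget(512))    # 510
print(BARTDatasetSketch().budget(512))  # 511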
@@ -104,21 +104,20 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
override_config_path=frozen_model_cfg,
)

if self.frozen_model.cfg.precision == 16:
self.float_type = torch.float16
elif self.frozen_model.cfg.precision == 'bf16':
self.float_type = torch.bfloat16
else:
self.float_type = torch.float

# TODO: Enable amp_o2 training
self.megatron_amp_o2 = False
self.pipeline_parallel = self.cfg.get('pipeline_model_parallel_size', 1) > 1
self.tokenizer = self.frozen_model.tokenizer
self.hidden_size = self.frozen_model.cfg.hidden_size
self.existing_tasks = list(self.cfg.get('existing_tasks', []))
self.new_tasks = list(self.cfg.get('new_tasks', []))
self.virtual_prompt_style = VirtualPromptStyle(cfg.virtual_prompt_style)

if self.pipeline_parallel:
assert (
self.cfg.optim.sched.get("min_lr", 0.0) == 0.0
), "Minimum lr must be 0.0 when pipeline parallel size is > 1"

# Load templates for assigning virtual prompt token positions
self.load_task_templates(self.cfg.task_templates)
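
The removed float_type bookkeeping above is superseded by self.autocast_dtype, which is assumed to be set by the shared base model; forward() below then wraps the frozen model call in torch.autocast only when that dtype is not full precision. A hedged sketch of the mapping and call pattern (the helper names here are illustrative, not NeMo APIs):

import torch

def autocast_dtype_from_precision(precision):
    # Mirrors the removed mapping: 16 -> fp16, 'bf16' -> bfloat16, otherwise fp32.
    if precision == 16:
        return torch.float16
    if precision == 'bf16':
        return torch.bfloat16
    return torch.float32

def call_frozen_model(model, autocast_dtype, **kwargs):
    # No autocast context for fp32; otherwise run the frozen model under CUDA autocast.
    if autocast_dtype == torch.float32:
        return model(**kwargs)
    with torch.autocast(device_type="cuda", dtype=autocast_dtype):
        return model(**kwargs)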

@@ -348,16 +347,33 @@ def setup_optimizer_param_groups(self):
to be passed around in pipeline parallel models. The prompt-encoder
and/or prompt table will use the learning rate set by the user.
"""
virtual_prompt_params = {'params': []}
frozen_model_params = {'params': [param for param in self.frozen_model.parameters()], 'lr': 0.0}
# Freeze frozen model
for param in self.frozen_model.parameters():
param.requires_grad = False

if self.frozen_model.model.pre_process:
virtual_prompt_params['params'].extend([param for param in self.prompt_table.parameters()])
# Need to handle frozen model freezing differently when pp > 1
if self.pipeline_parallel:
virtual_prompt_params = {'params': []}
frozen_model_params = {'params': [], 'lr': 0.0}

if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
virtual_prompt_params['params'].extend([param for param in self.prompt_encoder.parameters()])
if self.frozen_model.model.pre_process:
virtual_prompt_params['params'].extend([param for param in self.prompt_table.parameters()])

if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
virtual_prompt_params['params'].extend([param for param in self.prompt_encoder.parameters()])

self._optimizer_param_groups = virtual_prompt_params, frozen_model_params
# Unfreeze one part of each transformer layer, setting lr to 0.0 so DDP
# and AMP won't complain but the model still remains frozen
for layer in self.frozen_model.model.language_model.encoder.layers:
for param in layer.input_layernorm.parameters():
param.requires_grad = True

frozen_model_params['params'].extend([param for param in self.frozen_model.parameters()])

self._optimizer_param_groups = virtual_prompt_params, frozen_model_params

else:
super().setup_optimizer_param_groups()
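
Summary of the pp > 1 branch above: every frozen-model weight has requires_grad switched off, then one small parameter set per transformer layer (the input layernorm) is switched back on inside a zero-learning-rate group so that DDP and AMP see a trainable tensor on every pipeline stage while the model stays effectively frozen; training_step later re-pins that group's lr to 0.0 after each update. A condensed, hypothetical sketch of the idea (attribute paths simplified, not the exact NeMo code):

import torch

def build_prompt_learning_param_groups(frozen_model: torch.nn.Module, prompt_params, layers):
    # Freeze every frozen-model weight ...
    for param in frozen_model.parameters():
        param.requires_grad = False

    # ... then re-enable a tiny parameter subset per layer so DDP/AMP have
    # something trainable on each pipeline stage.
    for layer in layers:
        for param in layer.input_layernorm.parameters():
            param.requires_grad = True

    virtual_prompt_group = {'params': list(prompt_params)}                 # trains at the user lr
    frozen_group = {'params': list(frozen_model.parameters()), 'lr': 0.0}  # pinned to lr 0.0
    return virtual_prompt_group, frozen_group

# After every optimizer step, mirroring training_step below:
# optimizer.param_groups[1]['lr'] = 0.0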

def forward(
self,
@@ -388,7 +404,7 @@ def forward(
encoder_input = None

# Call forward on GPT model with preprocessed embeddings
if self.float_type == torch.float32:
if self.autocast_dtype == torch.float32:
output = self.frozen_model.model(
input_ids=None,
position_ids=None,
@@ -399,7 +415,7 @@
inference_max_sequence_len=inference_max_sequence_len,
)
else:
with torch.autocast(device_type="cuda", dtype=self.float_type):
with torch.autocast(device_type="cuda", dtype=self.autocast_dtype):
output = self.frozen_model.model(
input_ids=None,
position_ids=None,
@@ -524,7 +540,7 @@ def fwd_bwd_step(self, batch, batch_idx, forward_only):
_, seq_length = batch[0].shape
tensor_shape = [seq_length, self.cfg.micro_batch_size, self.hidden_size]

if self.cfg.get('pipeline_model_parallel_size', 1) > 1:
if self.pipeline_parallel:
losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving(
forward_step_func=self.get_forward_output_and_loss_func(),
batch=batch,
@@ -580,7 +596,8 @@ def training_step(self, batch, batch_idx):

# Need to make sure the frozen model param learning rate stays 0.0
# so forcing lr to be 0.0 for gpt layers before param update
self._optimizer.param_groups[1]['lr'] = 0.0
if self.pipeline_parallel:
self._optimizer.param_groups[1]['lr'] = 0.0

return loss_mean

@@ -712,24 +729,24 @@ def build_virtual_prompt_dataset(
task_templates=self.task_templates,
pseudo_tokens=self.pseudo_tokens,
pad_token_id=self.pad_token_id,
max_seq_length=self.cfg.data.get('max_seq_length', self.frozen_model.cfg.max_position_embeddings),
max_seq_length=self.frozen_model.cfg.encoder_seq_length,
min_seq_length=self.cfg.data.get('min_seq_length', 1),
add_bos=self.cfg.data.get('add_bos', False),
add_eos=self.cfg.data.get('add_eos', True),
for_train=for_train,
)

rank = parallel_state.get_data_parallel_rank()
world_size = parallel_state.get_data_parallel_world_size()
data_parallel_size = parallel_state.get_data_parallel_world_size()
sampler = torch.utils.data.distributed.DistributedSampler(
dataset, num_replicas=world_size, rank=rank, shuffle=shuffle
dataset, num_replicas=data_parallel_size, rank=rank, shuffle=shuffle
)

dataloader = torch.utils.data.DataLoader(
dataset,
collate_fn=dataset.collate_fn,
sampler=sampler,
batch_size=batch_size,
batch_size=batch_size // data_parallel_size,
drop_last=drop_last,
num_workers=num_workers,
pin_memory=pin_memory,
@@ -771,7 +788,7 @@ def dummy():
task_templates=self.task_templates,
pseudo_tokens=self.pseudo_tokens,
pad_token_id=self.pad_token_id,
max_seq_length=self.cfg.data.get('max_seq_length', self.frozen_model.cfg.max_position_embeddings),
max_seq_length=self.frozen_model.cfg.encoder_seq_length,
min_seq_length=self.cfg.data.get('min_seq_length', 1),
add_bos=sampling_params["add_BOS"],
add_eos=False,
@@ -820,7 +837,7 @@ def set_input_tensor(self, input_tensor):
model's forward_step_func won't have it. This function is thus
used by internal code to bypass the input provided by the
forward_step_func"""
# self.input_tensor = input_tensor

self.frozen_model.model.set_input_tensor(input_tensor)

def get_forward_output_and_loss_func(self):
@@ -120,6 +120,8 @@ def setup_optimizer_param_groups(self):

def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
# TODO: create get_encoder_decoder_model() here for different losses (e.g., nll, vae, mim)
if parallel_state.get_pipeline_model_parallel_world_size() > 1 and self.cfg.encoder_arch == 'perceiver':
raise ValueError("Perceivers with pipeline parallel > 1 are not supported yet.")
if hasattr(self.cfg, 'bias_gelu_fusion'):
logging.warning('bias_gelu_fusion is deprecated. Please use bias_activation_fusion instead.')
activation_fusion = self.cfg.bias_gelu_fusion
@@ -163,6 +165,8 @@ def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
normalization=self.cfg.get('normalization', 'layernorm'),
transformer_block_type=self.cfg.get('transformer_block_type', 'pre_ln'),
headscale=self.cfg.get('headscale', False),
hidden_steps=self.cfg.get('hidden_steps', -1),
num_self_attention_per_cross_attention=self.cfg.get('num_self_attention_per_cross_attention', 1),
add_encoder=add_encoder,
add_decoder=add_decoder,
)
@@ -76,7 +76,6 @@ def get_decoder_model(
headscale=False,
transformer_block_type="pre_ln",
hidden_steps=-1,
hidden_blocks=1,
parent_model_type=ModelType.encoder_or_decoder,
layer_type=None,
chunk_size=64,
@@ -13,7 +13,9 @@
# limitations under the License.

"""Transformer based language model."""
import torch

from nemo.collections.nlp.modules.common.megatron.megatron_perceiver_encoders import MegatronPerceiverEncoderModule
from nemo.collections.nlp.modules.common.megatron.module import MegatronModule
from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults

@@ -41,15 +43,25 @@ def __init__(
# AttnMaskType enum mask type (e.g., padding, causal)
encoder_attn_mask_type: AttnMaskType = None,
decoder_attn_mask_type: AttnMaskType = None,
hidden_steps: int = None,
):
super(MegatronTransformerEncoderDecoderModule, self).__init__()

self.encoder = encoder
self.decoder = decoder
self.hidden_steps = hidden_steps
if isinstance(encoder, MegatronPerceiverEncoderModule) and hidden_steps is None:
raise ValueError(
f"hidden_steps cannot be None for perceiver encoders. It is needed to compute the encoder-decoder cross attention mask."
)

# try to infer mask_type if not given
if encoder_attn_mask_type is None:
if encoder is None:
encoder_attn_mask_type = None
# Perceiver does not have a `.model` attribute, assume it always uses padding mask.
elif isinstance(encoder, MegatronPerceiverEncoderModule):
encoder_attn_mask_type = AttnMaskType.padding
elif hasattr(encoder.model, 'self_attn_mask_type'):
encoder_attn_mask_type = encoder.model.self_attn_mask_type
else:
@@ -136,6 +148,10 @@ def forward(
return enc_output

# decoder
# Adjust encoder attention mask if encoder is a perceiver.
if self.encoder is not None and isinstance(self.encoder, MegatronPerceiverEncoderModule):
enc_attn_mask = torch.ones(enc_output.size(0), self.hidden_steps).to(enc_output.device)

dec_output = self.decode(
dec_input=dec_input,
dec_attn_mask=dec_attn_mask,
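
Because a perceiver encoder compresses the source into a fixed set of hidden_steps latent vectors, the original padding mask (sized to the source length) no longer matches the encoder output, so the cross-attention mask handed to the decoder is rebuilt as an all-ones mask over those latents, as in the torch.ones(...) call above. A small sketch of that substitution (device placement added; the dimension layout follows whatever convention enc_output uses in NeMo):

import torch

def perceiver_cross_attention_mask(enc_output: torch.Tensor, hidden_steps: int) -> torch.Tensor:
    # Every latent position is valid, so the decoder may attend to all
    # `hidden_steps` latents for each example.
    return torch.ones(enc_output.size(0), hidden_steps, device=enc_output.device)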