GPT extrapolatable position embedding (xpos/sandwich/alibi/kerple) and Flash Attention #6666

Merged · 571 commits · Jun 12, 2023
Changes from 22 commits

Commits
a8564d3
move to nvidia megatron repo (#6465) (#6475)
github-actions[bot] Apr 24, 2023
7a17f73
Megatron KERPLE positional embeddings (#6478) (#6480)
github-actions[bot] Apr 24, 2023
a67b00f
Fix an invalid link in get_data.py of ljspeech (#6456)
pythinker Apr 24, 2023
1e1fbbe
1. Added external index sample. (#6462) (#6483)
github-actions[bot] Apr 25, 2023
4561e12
Update README to add core installation (#6488) (#6489)
github-actions[bot] Apr 25, 2023
599f522
Fix cache aware hybrid bugs (#6466) (#6484)
github-actions[bot] Apr 25, 2023
ae4a4dd
Fix typos (#6494) (#6495)
github-actions[bot] Apr 26, 2023
df2b870
Add disclaimer about dataset for ASR (#6496)
titu1994 Apr 26, 2023
0c85e21
fix (#6502)
Jorjeous Apr 26, 2023
24c77d0
fix broken links r1.18.0 (#6501) (#6504)
github-actions[bot] Apr 26, 2023
07f6533
[TTS] Create functions for TTS preprocessing without dataloader (#6317)
rlangman Apr 27, 2023
8bffc80
Cache aware streaming nfa (#6209)
Slyne Apr 27, 2023
6b84a8a
[BugFix] Force _get_batch_preds() to keep logits in decoder timestamp…
tango4j Apr 28, 2023
56ce2a6
[TTS] Fix FastPitch energy code (#6511)
rlangman Apr 28, 2023
b460716
fix custom forward_torch_softmax (#6512) (#6517)
github-actions[bot] Apr 28, 2023
319b191
[TTS] fixed broken path. (#6514) (#6518)
github-actions[bot] Apr 28, 2023
2dd91fa
Fix normalization of impulse response in ImpulsePerturbation (#6505)
anteju Apr 28, 2023
d0e2f5a
Add interleaved pp support (#6498)
titu1994 Apr 28, 2023
3cff6ce
Fix typos (#6523)
titu1994 May 1, 2023
c2a4264
New noise_norm perturbation based on Riva work (#6445)
trias702 May 2, 2023
669a8c2
[TTS] Add script for computing feature stats (#6508)
rlangman May 2, 2023
798978d
Add Frame-VAD model and datasets (#6441)
stevehuang52 May 2, 2023
cb53ede
Support dynamic length batches with GPT SFT (#6510)
aklife97 May 2, 2023
1217668
added back the fast emit section to the configs. (#6540) (#6542)
github-actions[bot] May 3, 2023
5090a94
removing unnessary avoid_bfloat16_autocast_context (#6481)
bmwshop May 3, 2023
b2f23bd
FC models in menu (#6473)
bmwshop May 3, 2023
6c77583
[TTS] Add tutorials for FastPitch TTS speaker adaptation with adapter…
hsiehjackson May 3, 2023
ce84b1f
[TTS] Create initial TTS dataset feature processors (#6507)
rlangman May 3, 2023
8bbc140
fix (#6529) (#6546)
github-actions[bot] May 3, 2023
dc0c332
Add FastConformer Hybrid ASR models for EN, ES, IT, DE, PL, HR, UA, B…
github-actions[bot] May 4, 2023
42691c3
Add scores for FastConformer models (#6557) (#6558)
github-actions[bot] May 4, 2023
e7f2210
Fix fp16 (#6543) (#6544)
github-actions[bot] May 4, 2023
69b2c34
Patch transcribe and support offline transcribe for hybrid model (#65…
github-actions[bot] May 4, 2023
24076ca
Fix notebook bad json (#6561)
titu1994 May 4, 2023
b41a511
Change Megatron Enc Dec model to use persistent_workers (#6548) (#6552)
github-actions[bot] May 4, 2023
77369ef
Make KenLM with PC for AggregateTokenizer and merge it (#6081)
karpnv May 4, 2023
fa62794
fix for running on 1 GPU.
khcs May 4, 2023
3817d41
temp rtd fix (#6568) (#6569)
github-actions[bot] May 4, 2023
a57ec70
[TTS] Add script for mapping speaker names to indices (#6509)
rlangman May 5, 2023
5fd9c7f
whitespace (#6574)
karpnv May 5, 2023
04c1b72
Update manifest.py for speedup (#6565) (#6573)
github-actions[bot] May 5, 2023
c13ffb9
More streaming conformer export fixes (#6567) (#6578)
github-actions[bot] May 5, 2023
846fc83
user selected max_seq_len should be less than model's max_seq_len (#6…
github-actions[bot] May 5, 2023
c19aac5
Framework for PEFT via mixins (#6391)
arendu May 5, 2023
fba50b8
cache and reuse inputs (#6422) (#6452)
github-actions[bot] May 7, 2023
d0785d5
Add patches for Virtual Parallel conversion (#6589)
titu1994 May 8, 2023
c7f58d8
Pass `.scale` instead of scaler object to core (#6551)
github-actions[bot] May 8, 2023
58440fb
Documentation for ASR-TTS models (#6594) (#6595)
github-actions[bot] May 8, 2023
aa2b9b8
[TTS] Fix aligner nan loss in fp32 (#6435)
hsiehjackson May 8, 2023
cf60b6c
Update SDP docs (#6485) (#6596)
github-actions[bot] May 8, 2023
3c1147f
Bug/typo fixes (#6599)
Kipok May 9, 2023
08ab1a7
Manual garbage collection with an interval (#6469) (#6482)
github-actions[bot] May 9, 2023
3ed0282
Make tensor split contiguous (#6580) (#6593)
github-actions[bot] May 9, 2023
a9d2910
[ASR] Fix for old models in change_attention_model (#6608)
sam1373 May 10, 2023
077b7f9
Update manifest.py to use os.path for get_full_path (#6598)
stevehuang52 May 10, 2023
9eed6d3
Cherry pick commits in #6601 to main (#6611)
fayejf May 10, 2023
77b9a85
Create dummy iters to satisy len checks (#6600) (#6603)
github-actions[bot] May 10, 2023
9f367f4
add GPT eval mode fix for interleaved to main (#6610)
aklife97 May 10, 2023
8592562
Fix batch size reconf for T5 FT for multi-validation (#6582) (#6588)
github-actions[bot] May 10, 2023
b3f5f39
Not doing CastToFloat by default (#6524) (#6563)
github-actions[bot] May 10, 2023
09f2e37
Turn autocast off when precision is fp32 (#6576)
github-actions[bot] May 10, 2023
2a446cb
update core commit hash in readme (#6622) (#6623)
github-actions[bot] May 10, 2023
2cc0f62
add hat image to docs (#6619) (#6621)
github-actions[bot] May 11, 2023
94e6e25
Allow indices exchange via distributed (#6618) (#6624)
github-actions[bot] May 11, 2023
7f48130
Offline and streaming inference support for hybrid model (#6570)
fayejf May 11, 2023
c44e3b6
Patch decoding for PC models (#6630) (#6631)
github-actions[bot] May 11, 2023
ef49b0a
Fix wer.py where 'errors' variable was not set (#6633) (#6634)
github-actions[bot] May 11, 2023
1b785e2
Restore GPT support for interleaved pipeline parallelism (#6528) (#6613)
timmoon10 May 11, 2023
44e890e
Add FA
hsiehjackson May 12, 2023
a5fcbee
Fix XPOS
hsiehjackson May 12, 2023
aedcc7c
Add warning
hsiehjackson May 12, 2023
7fbf571
Fix bugs
hsiehjackson May 13, 2023
ddb067e
Fix attention
hsiehjackson May 13, 2023
81a8c21
Fix comment
hsiehjackson May 15, 2023
36d685b
Fix cast dtype
hsiehjackson May 15, 2023
a1d1e5a
Undo xpos
hsiehjackson May 15, 2023
2eaa60a
bugfix (#6636)
fayejf May 11, 2023
5eb3552
Disable interctc tests (#6638)
Kipok May 11, 2023
4e94268
Add megatron_core to requirements (#6639) (#6640)
github-actions[bot] May 11, 2023
56847f3
Remove from jenkins (#6642)
github-actions[bot] May 11, 2023
986feed
sft model can use this script for eval (#6637)
arendu May 12, 2023
6d2c969
[TTS] Fix TTS audio preprocessing bugs (#6628)
rlangman May 12, 2023
954d43f
Move black parameters to pyproject.toml (#6647)
artbataev May 12, 2023
11c58f3
ASR-TTS Models: Support hybrid RNNT-CTC, improve docs. (#6620)
artbataev May 12, 2023
db7d578
fix conversion and eval (#6648)
arendu May 13, 2023
acb2c56
Confidence ensembles implementation (#6614)
Kipok May 15, 2023
1b28a7b
Patch memory used for NeMo Megatron models (#6615)
titu1994 May 15, 2023
6fb6e47
handle artifacts when path is dir (#6658)
arendu May 16, 2023
4ccba61
remove upgrading setuptools in reinstall.sh (#6659)
XuesongYang May 16, 2023
82d5d58
merge lora weights into base model (#6597)
arendu May 16, 2023
89b428c
upgrade to 23.04 (#6660)
ericharper May 16, 2023
9683d02
Merge r1.18.0 bugfixes and doc updates to main (#6655)
ericharper May 16, 2023
c648d99
Confidence ensembles: fix issues and add tuning functionality (#6657)
Kipok May 16, 2023
f736f60
[TTS] Implement new TextToSpeech dataset (#6575)
rlangman May 16, 2023
4e7afbb
Dialogue dataset (#6654)
yidong72 May 16, 2023
7e62925
Add support for RNNT/hybrid models to partial transcribe (#6609)
stevehuang52 May 16, 2023
e009385
eval_beamsearch_ngram.py with hybrid ctc (#6656)
karpnv May 17, 2023
c5e229a
fix bucketing bug issue for picking new bucket (#6663)
nithinraok May 17, 2023
9d7d0b1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 17, 2023
b739a5e
Add t5 flash-attention
hsiehjackson May 18, 2023
473ff20
PE refactor (#6673)
hsiehjackson May 18, 2023
4a0699d
Add singleton alibi
hsiehjackson May 18, 2023
9cfea92
Fix FA mask
hsiehjackson May 18, 2023
8c3bfbd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 18, 2023
9d01255
singleton PE
hsiehjackson May 18, 2023
8a6e294
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 18, 2023
8bd1466
Fix attn bias inference
hsiehjackson May 22, 2023
0e02478
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 18, 2023
8ed6a0a
fix eval
ekmb May 19, 2023
a6b856c
[TTS] Add callback for saving audio during FastPitch training (#6665)
rlangman May 18, 2023
213b5a3
update batch size recommendation to min 32 for 43b (#6675)
Zhilin123 May 18, 2023
1b93141
Make Note usage consistent in adapter_mixins.py (#6678)
BrianMcBrayer May 18, 2023
d2938b9
Fix masking bug for TTS Aligner (#6677)
redoctopus May 18, 2023
1564d94
[ASR] Adding ssl config for fast-conformer (#6672)
krishnacpuvvada May 19, 2023
82f863b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2023
8b55842
Fix xpos offset
hsiehjackson May 23, 2023
fbdd7fe
Fix sequence parallel
hsiehjackson May 24, 2023
8535a6a
Fix parallel
hsiehjackson May 24, 2023
873f2e1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 23, 2023
7847a54
Uncomment correct bias size
hsiehjackson May 24, 2023
4aa46d7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 24, 2023
7514bf4
Remove unused module
hsiehjackson May 25, 2023
9a133d0
Fix singleton tril
hsiehjackson May 25, 2023
5ce3819
Fix kerple/sandwitch rename xpos
hsiehjackson May 25, 2023
bbee276
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 25, 2023
de61214
fix sandwich
hsiehjackson May 25, 2023
dcab11e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 25, 2023
cd3bb6d
Add unitest
hsiehjackson May 30, 2023
4fac042
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 25, 2023
129e55d
Fix bug
hsiehjackson May 30, 2023
3b5ec97
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 30, 2023
b2eb222
Add requirements
hsiehjackson May 30, 2023
c73f983
Remove requirements
hsiehjackson May 30, 2023
06ce313
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 30, 2023
8c969fe
Remove requirement flash-attn
hsiehjackson May 30, 2023
f70cc3f
Fix FA causal for inference
hsiehjackson Jun 1, 2023
a0cea83
Add experimental PE
hsiehjackson Jun 1, 2023
c7c6a1b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 1, 2023
6876703
Update all invalid tree references to blobs for NeMo samples (#6679)
BrianMcBrayer May 19, 2023
6c65625
Update README.rst about container (#6686)
fayejf May 19, 2023
456153a
Fix a bug, use _ceil_to_nearest instead as _round_to_nearest is not d…
github-actions[bot] May 20, 2023
69992d6
Enable ONNX export of 5B GPT trained with TE FP8 modules (#6458)
asfiyab-nvidia May 22, 2023
4ee6d8f
[TTS] Add script for text preprocessing (#6541)
rlangman May 22, 2023
c856936
[TTS] Fix adapter duration issue (#6697)
hsiehjackson May 22, 2023
b70dbf7
karpnv/issues6690 (#6705)
karpnv May 23, 2023
1a66d30
Limit codeql scope (#6710)
titu1994 May 23, 2023
ff772f7
eval fix (#6685)
arendu May 23, 2023
2231a57
Fix k2 installation in Docker with CUDA 12 (#6707) (#6709)
github-actions[bot] May 24, 2023
8b3dce5
[TTS] Filter out silent audio files during preprocessing (#6716)
rlangman May 24, 2023
963855b
not pinning version (#6680)
yidong72 May 24, 2023
b0f33f1
Tutorial fixes (#6717) (#6718)
github-actions[bot] May 24, 2023
a4ef711
preprocess squad in sft format (#6727)
arendu May 25, 2023
da5e6f8
Fix Codeql (#6731)
titu1994 May 25, 2023
2c35e0b
[TTS] fix inconsistent type hints for IpaG2p (#6733)
XuesongYang May 26, 2023
2bac13d
VP Fixes for converter + Config management (#6698)
titu1994 May 26, 2023
5831405
Graph RNNT: Grid- and Compose-Transducer. W-Transducer loss (#6168)
artbataev May 26, 2023
2e963da
Fix fastpitch test nightly (#6730)
hsiehjackson May 26, 2023
7f83283
Fix for interctc test random failure (#6644)
Kipok May 26, 2023
599c503
check for first or last stage (#6708) (#6743)
github-actions[bot] May 27, 2023
0725b2d
sharded manifests docs (#6751)
bmwshop May 29, 2023
bdeab5b
[TTS] relax hardcoded prefix for phonemes and tones and infer phoneme…
XuesongYang May 30, 2023
146371b
[TTS] corrected misleading deprecation warnings. (#6702)
XuesongYang May 30, 2023
8f43ae3
Bug fix to restore act ckpt (#6753) (#6755)
github-actions[bot] May 31, 2023
7daad62
Bug fix to reset sequence parallelism (#6756) (#6770)
github-actions[bot] May 31, 2023
49e016e
Fix TTS adapter tutorial (#6741)
hsiehjackson May 31, 2023
34f5452
Fix checkpointed forward and add test for full activation checkpointi…
github-actions[bot] May 31, 2023
c022acb
lora notebook (#6765)
arendu May 31, 2023
e98f425
Fix Links (#6777) (#6778)
github-actions[bot] May 31, 2023
bcb3fd3
Remove alibi tril
hsiehjackson Jun 1, 2023
71bff2f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 1, 2023
2e6eba5
Add flash-attn requirement
hsiehjackson Jun 1, 2023
424a15d
revert sft dataset changes
ekmb Jun 1, 2023
e79a35a
Move flash-attn requirement
hsiehjackson Jun 1, 2023
4c953aa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 1, 2023
0b18768
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 1, 2023
8863360
Add install
hsiehjackson Jun 1, 2023
2dc0418
peft eval directly from ckpt (#6785)
arendu Jun 1, 2023
1353aca
Add Frame-VAD examples and utils (#6463)
stevehuang52 Jun 1, 2023
b8d19b2
[TTS][zh] refine hardcoded lowercase for ASCII letters. (#6781)
XuesongYang Jun 2, 2023
7ad325d
Revert evaluation
hsiehjackson Jun 2, 2023
b875a78
Revert evaluation
hsiehjackson Jun 2, 2023
1f229c0
Fix
hsiehjackson Jun 2, 2023
26dbc9f
Fix gpu
hsiehjackson Jun 2, 2023
a3cf08e
Spellchecking ASR customization model (#6179)
bene-ges Jun 2, 2023
90ef33a
Fix test
hsiehjackson Jun 2, 2023
380a6f2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 2, 2023
b69cbf7
Fix device
hsiehjackson Jun 2, 2023
de52c2d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 2, 2023
8dc863b
Fix conflict
hsiehjackson Jun 2, 2023
29c3cd4
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 2, 2023
e782202
Revert
hsiehjackson Jun 2, 2023
7f40a05
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 2, 2023
d814f47
clean
hsiehjackson Jun 2, 2023
65118c4
Change device
hsiehjackson Jun 2, 2023
89d4547
Change device
hsiehjackson Jun 2, 2023
9c50e29
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 2, 2023
218ffa3
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 2, 2023
84acce0
Add test FA
hsiehjackson Jun 5, 2023
874f992
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 5, 2023
35ac850
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 5, 2023
98783ce
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 5, 2023
08dbd86
Add CI
hsiehjackson Jun 5, 2023
bdfe61e
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 5, 2023
6df2df8
Fix yaml order
hsiehjackson Jun 5, 2023
1f460d9
Test random attention mask
hsiehjackson Jun 5, 2023
01f4391
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 5, 2023
23634bf
Add install FA for tests
hsiehjackson Jun 6, 2023
4cfb2da
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 6, 2023
528c416
cherry pick 6788 (#6816)
ekmb Jun 6, 2023
a751928
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 6, 2023
ee692d4
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 6, 2023
5178f6b
Support 2D mask
hsiehjackson Jun 6, 2023
45876ad
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 6, 2023
1c15644
add missing comp_att_mask arg
ekmb Jun 6, 2023
74da509
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
ekmb Jun 6, 2023
5da1bc3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 6, 2023
fb895da
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 6, 2023
81d2fb0
Fix code ql
hsiehjackson Jun 6, 2023
b578ff5
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 6, 2023
82120c3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 6, 2023
a9bb73e
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 6, 2023
662733b
Megatron MPT-7B Support (#6804)
trias702 Jun 7, 2023
6b18be2
Fix test triton
hsiehjackson Jun 7, 2023
bdd91d6
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 7, 2023
92e7dba
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 7, 2023
bb89e61
Update FA in CI
hsiehjackson Jun 7, 2023
672f262
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 7, 2023
2a526ad
Fix Jenkin error
hsiehjackson Jun 7, 2023
0ac5374
Resume with FA
hsiehjackson Jun 7, 2023
7acf5cf
Follow comments
hsiehjackson Jun 7, 2023
cdff779
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 7, 2023
dcab29d
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 7, 2023
aba44ae
Fix README
hsiehjackson Jun 7, 2023
7c0a530
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 7, 2023
194b4bb
Fix README
hsiehjackson Jun 7, 2023
fe173c1
Remove torch.cuda
hsiehjackson Jun 7, 2023
7c38447
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 7, 2023
1104174
Remove unused import
hsiehjackson Jun 7, 2023
a3010bd
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 7, 2023
81b002e
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 8, 2023
a883aa2
kerple init
hsiehjackson Jun 8, 2023
6a895f0
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 8, 2023
0504814
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 8, 2023
889dec6
Add TE comment
hsiehjackson Jun 8, 2023
fd2899a
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 8, 2023
7255e31
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 8, 2023
c972553
Merge branch 'main' into gpt-alibi-FA
hsiehjackson Jun 9, 2023
b8b5611
Fix error when inference.compute_attention_mask=False
hsiehjackson Jun 9, 2023
83ef08d
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson Jun 9, 2023
498ec3d
Merge branch 'main' into gpt-alibi-FA
michalivne Jun 11, 2023
@@ -77,7 +77,7 @@ model:
  transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
  openai_gelu: False # Use OpenAI's GELU instead of the default GeLU
  normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
  position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope']
  position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'xpos', 'sandwich']
  rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this.
  attention_type: 'multihead' # Attention type. Options ['multihead']
  share_embeddings_and_output_weights: True # Share embedding and output layer weights.
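For orientation, here is a minimal sketch (not part of this PR) of how the new knobs could be set programmatically. Only the key names position_embedding_type and use_flash_attention are taken from the diffs; the OmegaConf wrapper and chosen values are illustrative.

# Minimal sketch (not part of the diff): overriding the new config keys with OmegaConf.
# Only the key names come from this PR; structure and values are illustrative.
from omegaconf import OmegaConf

overrides = OmegaConf.create(
    {
        'model': {
            'position_embedding_type': 'xpos',  # one of the options listed above
            'use_flash_attention': True,  # plumbed through the GPT model diffs below
        }
    }
)

allowed = {'learned_absolute', 'rope', 'alibi', 'xpos', 'sandwich'}
assert overrides.model.position_embedding_type in allowed
print(OmegaConf.to_yaml(overrides))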
@@ -161,6 +161,7 @@ def __init__(
        fp8_amax_compute_algo='most_recent',
        reduce_amax=True,
        use_emha=False,
        use_flash_attention=False,
    ):
        super(GPTModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights)

@@ -239,6 +240,7 @@ def __init__(
            fp8_amax_compute_algo=fp8_amax_compute_algo,
            reduce_amax=reduce_amax,
            use_emha=use_emha,
            use_flash_attention=use_flash_attention,
        )

        if self.share_embeddings_and_output_weights:
@@ -253,6 +253,7 @@ def model_provider_func(self, pre_process, post_process):
            fp8_amax_compute_algo=self.cfg.get('fp8_amax_compute_algo', 'most_recent'),
            reduce_amax=self.cfg.get('reduce_amax', True),
            use_emha=self.cfg.get('use_emha', False),
            use_flash_attention=self.cfg.get('use_flash_attention', False),
        )

        return model
316 changes: 211 additions & 105 deletions nemo/collections/nlp/modules/common/megatron/attention.py

Large diffs are not rendered by default.
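Since the attention.py changes are not rendered here, the following is a generic, hedged sketch of the pattern the new position embeddings rely on: an additive per-head bias of shape [num_heads, sq, sk] is added to the raw attention scores before the causal mask and softmax. None of these names come from attention.py; this is plain PyTorch for illustration only.

# Generic sketch (not the actual attention.py changes): folding an additive
# relative position bias into scaled dot-product attention.
import math
import torch

def attention_with_bias(q, k, v, position_bias=None, causal=True):
    # q, k, v: [batch, num_heads, seq_len, head_dim]; position_bias: [num_heads, sq, sk]
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(q.size(-1))
    if position_bias is not None:
        scores = scores + position_bias.unsqueeze(0)  # broadcast over the batch
    if causal:
        sq, sk = scores.shape[-2:]
        mask = torch.ones(sq, sk, dtype=torch.bool, device=scores.device).triu(1)
        scores = scores.masked_fill(mask, torch.finfo(scores.dtype).min)
    probs = torch.softmax(scores, dim=-1)
    return torch.matmul(probs, v)

q = k = v = torch.randn(2, 4, 8, 16)
bias = torch.zeros(4, 8, 8)  # e.g. an ALiBi/KERPLE/sandwich bias
out = attention_with_bias(q, k, v, bias)
assert out.shape == (2, 4, 8, 16)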

@@ -0,0 +1,94 @@
import math

import torch
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.parameter import Parameter

from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults

try:
    from apex.transformer import parallel_state, tensor_parallel
    from apex.transformer.enums import AttnMaskType, AttnType
    from apex.transformer.utils import divide as safe_divide

    HAVE_APEX = True

except (ImportError, ModuleNotFoundError):

    HAVE_APEX = False

    # fake missing classes with None attributes
    ModelType = AttnMaskType = AttnType = LayerType = ApexGuardDefaults()


def get_kerple_log_params(num_attention_heads, precision):

    try:
        model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
    except:
        model_parallel_size = 1
    num_heads_per_partition = safe_divide(num_attention_heads, model_parallel_size)

    dtype_dict = {16: torch.float16, 32: torch.float32, 'bf16': torch.bfloat16}

    def get_parameter(scale, init_method):
        if init_method == 'ones':
            return Parameter(
                torch.ones(
                    num_heads_per_partition,
                    device=torch.cuda.current_device(),
                    dtype=dtype_dict[precision],
                )[:, None, None]
                * scale
            )
        elif init_method == 'uniform':
            return Parameter(
                torch.rand(
                    num_heads_per_partition,
                    device=torch.cuda.current_device(),
                    dtype=dtype_dict[precision],
                )[:, None, None]
                * scale
            )

    bias_p = get_parameter(2, 'uniform')
    bias_a = get_parameter(1, 'uniform')

    return torch.concat((bias_p, bias_a))


def kerple_log_forward(seq_len_q, seq_len_k, relative_position_bias):
    # The packed tensor holds bias_p and bias_a for the local attention heads.
    bias_p, bias_a = torch.split(relative_position_bias, relative_position_bias.size(0) // 2, dim=0)

    eps = 1e-2

    # We may be able to save this and avoid recomputing it every time like in the
    # reference implementation.
    # Currently kept this way to be compatible with the checkpointed-attn-forward.
    # TODO: find a way to avoid recomputing this every time.
    diff = torch.tril(
        torch.arange(seq_len_k, device=relative_position_bias.device).view(seq_len_k, 1).repeat(1, seq_len_k)
        + torch.arange(0, -seq_len_k, -1, device=relative_position_bias.device)
    )
    diff = diff.to(relative_position_bias.dtype)

    bias_p.data = bias_p.data.clamp(min=eps)
    bias_a.data = bias_a.data.clamp(min=eps)
    bias = -bias_p * torch.log(1 + bias_a * diff)  # log kernel

    if seq_len_q != seq_len_k:
        # In the train case the bias has dimensionality [b, np, sq, sk] with sq == sk:
        # the number of query tokens equals the number of key tokens.
        # At inference time with a cache in layer_past, sq is not equal to sk: sq contains
        # only one token (the last one in the full sequence), so the appropriate token index
        # of the cache matrix is used. As the cache matrix could already be bigger from a
        # past inference, it is not the last token index of the sq sequence that is used.
        assert (
            seq_len_q == 1
        ), "assumption sq == sk unless at inference time with cache in layer_past with sq == 1"

        if type(bias) != float:
            # seq_len_k - 1 points to the last token index in the current inference batch.
            bias = bias[:, seq_len_k - 1, :].view(bias.shape[0], 1, bias.shape[2])

    return bias
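A small usage sketch (not part of the diff) for the helpers above. get_kerple_log_params needs Apex and a CUDA device, so the packed [2 * num_heads, 1, 1] parameter tensor is emulated here with torch.rand; the expected output shapes follow the comments in kerple_log_forward.

# Usage sketch (not part of the diff): exercising kerple_log_forward on CPU.
import torch

num_heads, sk = 4, 8
relative_position_bias = torch.rand(2 * num_heads, 1, 1)  # stand-in for get_kerple_log_params output

# Training-style call: sq == sk, bias has shape [num_heads, sk, sk].
bias = kerple_log_forward(sk, sk, relative_position_bias)
assert bias.shape == (num_heads, sk, sk)

# Inference-style call with a KV cache: sq == 1, bias has shape [num_heads, 1, sk].
bias_step = kerple_log_forward(1, sk, relative_position_bias)
assert bias_step.shape == (num_heads, 1, sk)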
51 changes: 49 additions & 2 deletions nemo/collections/nlp/modules/common/megatron/language_model.py
@@ -23,6 +23,12 @@
from nemo.collections.nlp.modules.common.megatron.module import MegatronModule
from nemo.collections.nlp.modules.common.megatron.rotary_pos_embedding import RotaryEmbedding
from nemo.collections.nlp.modules.common.megatron.transformer import ParallelTransformer
from nemo.collections.nlp.modules.common.megatron.alibi_relative_position_embedding import (
    ALiBiRelativePositionEmbedding,
)
from nemo.collections.nlp.modules.common.megatron.kerple_relative_position_embedding import (
    KERPLERelativePositionEmbedding,
)
from nemo.collections.nlp.modules.common.megatron.utils import (
    ApexGuardDefaults,
    get_linear_layer,
@@ -114,6 +120,7 @@ def get_language_model(
    fp8_amax_compute_algo='most_recent',
    reduce_amax=True,
    use_emha=False,
    use_flash_attention=False,
):
    """Build language model and return along with the key to save."""

@@ -188,6 +195,7 @@ def get_language_model(
        fp8_amax_compute_algo=fp8_amax_compute_algo,
        reduce_amax=reduce_amax,
        use_emha=use_emha,
        use_flash_attention=use_flash_attention,
    )
    # key used for checkpoints.
    language_model_key = 'language_model'
@@ -487,6 +495,7 @@ def __init__(
        fp8_amax_compute_algo='most_recent',
        reduce_amax=True,
        use_emha=False,
        use_flash_attention=False,
    ):
        super(TransformerLanguageModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights)

@@ -538,6 +547,30 @@ def __init__(
                rotary_dim = int(rotary_dim * rotary_percentage)
            self.rotary_pos_emb = RotaryEmbedding(rotary_dim)

        elif position_embedding_type == 'alibi':
            # TODO: If this is used for an encoder-decoder model, implement proper logic
            # and the corresponding addition for the decoder. Currently it is only used
            # for decoder-only models. Encoder-decoder models such as T5 are implemented
            # in token_level_encoder_decoder.py.
            self.encoder_relative_position_embedding = ALiBiRelativePositionEmbedding(
                bidirectional=False,
                num_attention_heads=num_attention_heads,
                layer_type=LayerType.encoder,
                num_attention_heads_alibi=None,
                max_seq_len=max_position_embeddings,
            )

        elif position_embedding_type == 'kerple':
            # TODO: If this is used for an encoder-decoder model, implement proper logic
            # and the corresponding addition for the decoder. Currently it is only used
            # for decoder-only models. Encoder-decoder models such as T5 are implemented
            # in token_level_encoder_decoder.py.
            self.decoder_relative_position_embedding = KERPLERelativePositionEmbedding(
                bidirectional=False,
                num_attention_heads=num_attention_heads,
                layer_type=LayerType.decoder,
                num_attention_heads_kerple=None,
                max_seq_len=max_position_embeddings,
            )

        # Transformer.
        self.encoder = ParallelTransformer(
            init_method=self.init_method,
@@ -588,6 +621,8 @@ def __init__(
            fp8_amax_compute_algo=fp8_amax_compute_algo,
            reduce_amax=reduce_amax,
            use_emha=use_emha,
            position_embedding_type=position_embedding_type,
            use_flash_attention=use_flash_attention,
        )
        self._encoder_key = 'encoder'

@@ -627,6 +662,8 @@ def __init__(
            activations_checkpoint_granularity=activations_checkpoint_granularity,
            activations_checkpoint_layers_per_pipeline=activations_checkpoint_layers_per_pipeline,
            transformer_engine=transformer_engine,
            position_embedding_type=position_embedding_type,
            use_flash_attention=use_flash_attention,
        )
        self._decoder_key = 'decoder'

@@ -697,6 +734,8 @@ def forward(

        # enc_attn_mask: [1, 1, s, s]

        rotary_pos_emb = None
        encoder_self_attention_relative_position_bias = None
        if self.position_embedding_type == 'rope':
            if inference_max_sequence_len is not None:
                rotary_pos_emb = self.rotary_pos_emb(inference_max_sequence_len)
@@ -714,8 +753,14 @@ def forward(
                )
            else:
                rotary_pos_emb = self.rotary_pos_emb(encoder_input.size(0))
        else:
            rotary_pos_emb = None
        elif self.position_embedding_type == 'alibi':
            enc_seq_length = enc_input_ids.size(1)
            encoder_self_attention_relative_position_bias = self.encoder_relative_position_embedding(
                query_seq_length=enc_seq_length, key_seq_length=enc_seq_length,
            )
        elif self.position_embedding_type == 'kerple':
            encoder_self_attention_relative_position_bias = self.encoder_relative_position_embedding

        # encoder.
        if enc_hidden_states is None:
@@ -730,6 +775,8 @@ def forward(
                rotary_pos_emb=(rotary_pos_emb, None, None)
                if rotary_pos_emb is not None
                else None,  # This assumes that this is being used as a GPT/BERT model only (no cross-attention)
                self_attention_relative_position_bias=encoder_self_attention_relative_position_bias
                if encoder_self_attention_relative_position_bias is not None else None
            )
        else:
            encoder_output = enc_hidden_states.to(encoder_input.dtype)
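ALiBiRelativePositionEmbedding and KERPLERelativePositionEmbedding are defined in separate files of this PR and are not shown in this view. For reference, here is a hedged sketch of the standard ALiBi construction from the original paper (geometric per-head slopes times negative key distance); NeMo's module may differ in details such as how heads are partitioned across tensor-parallel ranks.

# Sketch of the textbook ALiBi bias (Press et al.), for orientation only.
import torch

def alibi_bias(num_heads, seq_len):
    # Geometric slopes 2^(-8/n), 2^(-16/n), ... for n heads (power-of-two head counts).
    slopes = torch.tensor([2 ** (-8.0 * (i + 1) / num_heads) for i in range(num_heads)])
    positions = torch.arange(seq_len)
    distance = positions[None, :] - positions[:, None]  # [sq, sk], positive above the diagonal
    distance = distance.tril()  # keep only keys at or before each query
    return slopes[:, None, None] * distance  # [num_heads, sq, sk], values <= 0

bias = alibi_bias(num_heads=8, seq_len=16)
assert bias.shape == (8, 16, 16)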
@@ -87,6 +87,7 @@ def get_decoder_model(
    moe_dropout=0.0,
    turn_off_rop=False,  # turn off the RoP positional embedding
    version=1,
    position_embedding_type='learned_absolute'
):
    """Build language model and return along with the key to save."""

@@ -143,6 +144,7 @@
            num_moe_experts=num_moe_experts,
            moe_frequency=moe_frequency,
            moe_dropout=moe_dropout,
            position_embedding_type=position_embedding_type
        )
    elif arch == "retro":
        decoder = MegatronRetrievalTransformerDecoderModule(
@@ -89,6 +89,7 @@ def get_encoder_model(
    moe_dropout=0.0,
    turn_off_rop=False,  # turn off the RoP positional embedding
    version=1,  # model version
    position_embedding_type='learned_absolute'
):
    """Build language model and return along with the key to save."""

@@ -145,6 +146,7 @@
            num_moe_experts=num_moe_experts,
            moe_frequency=moe_frequency,
            moe_dropout=moe_dropout,
            position_embedding_type=position_embedding_type
        )
    elif arch == "retro":
        encoder = MegatronRetrievalTransformerEncoderModule(
@@ -84,6 +84,7 @@ def __init__(
        num_moe_experts=1,
        moe_frequency=1,
        moe_dropout=0.0,
        position_embedding_type='learned_absolute'
    ):
        super(MegatronTransformerDecoderModule, self).__init__()

@@ -147,6 +148,7 @@ def __init__(
            num_moe_experts=num_moe_experts,
            moe_frequency=moe_frequency,
            moe_dropout=moe_dropout,
            position_embedding_type=position_embedding_type
        )
        self._model_key = 'model'

@@ -81,6 +81,7 @@ def __init__(
        num_moe_experts=1,
        moe_frequency=1,
        moe_dropout=0.0,
        position_embedding_type='learned_absolute'
    ):
        super(MegatronTransformerEncoderModule, self).__init__()

@@ -145,6 +146,7 @@ def __init__(
            num_moe_experts=num_moe_experts,
            moe_frequency=moe_frequency,
            moe_dropout=moe_dropout,
            position_embedding_type=position_embedding_type
        )
        self._model_key = 'model'

@@ -0,0 +1,25 @@
import torch
import torch.nn as nn
from torch.nn import functional as F


def sandwich_pos_bias(qlen, klen, hidden_size, num_attention_heads, device):
    context_position = torch.arange(qlen, dtype=torch.long, device=device)[:, None]
    memory_position = torch.arange(klen, dtype=torch.long, device=device)[None, :]
    relative_position = memory_position - context_position  # shape (qlen, klen)

    inv_freq = 1.0 / (10000 ** (2 * torch.arange(1, hidden_size / 2, device=device) / hidden_size))

    _bias = torch.sum(relative_position[:, :, None].repeat(1, 1, len(inv_freq)) * inv_freq, axis=2)
    bias = _bias.repeat(num_attention_heads, 1, 1)

    _bias_scales = torch.arange(1, num_attention_heads + 1, 1, device=device)
    bias_scales = torch.stack(
        list(map(lambda x, y: x * y, _bias_scales, torch.ones(num_attention_heads, qlen, klen, device=device)))
    )
    scaled_bias = (bias - hidden_size / 2) / (bias_scales * 8 / num_attention_heads)

    return scaled_bias
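A quick usage sketch (not part of the diff) for sandwich_pos_bias above: the returned tensor has one bias matrix per attention head, shaped [num_attention_heads, qlen, klen], and is meant to be added to the raw attention scores like the other relative biases in this PR.

# Usage sketch (not part of the diff): shape check for the helper above.
import torch

bias = sandwich_pos_bias(qlen=8, klen=8, hidden_size=64, num_attention_heads=4, device=torch.device("cpu"))
assert bias.shape == (4, 8, 8)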
@@ -237,6 +237,7 @@ def __init__(
                num_moe_experts=encoder_cfg.get('num_moe_experts', 1),
                moe_frequency=encoder_cfg.get('moe_frequency', 1),
                moe_dropout=encoder_cfg.get('moe_dropout', 0.0),
                position_embedding_type=decoder_cfg.get('position_embedding_type', 'learned_absolute')
            )

        if add_decoder:
@@ -365,6 +366,7 @@ def __init__(
                num_moe_experts=decoder_cfg.get('num_moe_experts', 1),
                moe_frequency=decoder_cfg.get('moe_frequency', 1),
                moe_dropout=decoder_cfg.get('moe_dropout', 0.0),
                position_embedding_type=decoder_cfg.get('position_embedding_type', 'learned_absolute')
            )

        self.enc_dec_model = MegatronTransformerEncoderDecoderModule(