Fix alibi #222
@@ -303,11 +303,27 @@ def forward(self, hidden_states, attention_mask, layer_past=None,
             query_layer, key_layer = apply_rotary_fn(query_layer, key_layer, cos, sin, offset=offset)
 
         # Raw attention scores. [b * np, sq, sk]
-        matmul_result = torch.baddbmm(
-            matmul_result,
-            query_layer.transpose(0, 1),   # [b * np, sq, hn]
-            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-            beta=0.0 if alibi is None else 1.0, alpha=(1.0/self.norm_factor))
+        if alibi is None:
+            matmul_result = torch.baddbmm(
+                matmul_result,
+                query_layer.transpose(0, 1),   # [b * np, sq, hn]
+                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+                beta=0.0, alpha=(1.0/self.norm_factor))
+        else:
+            if not hasattr(self, "logged_alibi"):
+                logger.debug("Using Alibi.")
+                self.logged_alibi = True
+
+            if self.apply_query_key_layer_scaling:
+                beta = 1.0 / self.layer_number
+            else:
+                beta = 1.0
+
+            matmul_result = torch.baddbmm(
+                matmul_result,
+                query_layer.transpose(0, 1),  # [b * np, sq, hn]
+                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+                beta=beta, alpha=(1.0 / self.norm_factor))
 
         # change view to [b, np, sq, sk]
         attention_scores = matmul_result.view(*output_size)
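For readers skimming the diff, a minimal standalone sketch (not part of the PR) of the trick both branches rely on: torch.baddbmm computes alpha * (batch1 @ batch2) + beta * input, so with beta=1.0 a buffer pre-filled with the ALiBi bias is folded into the raw attention scores in a single kernel, while beta=0.0 simply discards the buffer. Shapes and values below are made up for illustration.

```python
import torch

# Toy shapes: [b * np, sq, hn] queries and [b * np, sk, hn] keys, as in the comments above.
b_np, sq, sk, hn = 4, 16, 16, 8
norm_factor = hn ** 0.5

query = torch.randn(b_np, sq, hn)
key = torch.randn(b_np, sk, hn)
alibi = torch.randn(b_np, 1, sk)   # per-head bias, broadcast over the query dimension

# beta=1.0 keeps the bias carried in the first argument; beta=0.0 (the alibi-is-None branch) drops it.
scores = torch.baddbmm(alibi, query, key.transpose(1, 2),
                       beta=1.0, alpha=1.0 / norm_factor)

expected = torch.bmm(query, key.transpose(1, 2)) / norm_factor + alibi
assert torch.allclose(scores, expected, atol=1e-5)
```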
@@ -470,9 +486,19 @@ def __init__(self, init_method, output_layer_init_method,
         self.mlp = ParallelMLP(init_method,
                                output_layer_init_method)
 
+        # Alibi
+        if args.position_embedding_type == PositionEmbeddingType.alibi:
+            self.alibi = self._build_alibi_tensor(args.seq_length, args.num_attention_heads, args.micro_batch_size).to(torch.cuda.current_device())
Review comment on the line above: micro batch size doesn't increase during batch size ramp-up, it's constant. We should care, since all our experiments run with batch size ramp-up, but I would expect it to crash badly if it doesn't match.
+            if args.params_dtype == torch.float16:
+                self.alibi = self.alibi.to(torch.float16)
+            elif args.params_dtype == torch.bfloat16:
+                self.alibi = self.alibi.to(torch.bfloat16)
+        else:
+            self.alibi = None
+
     def forward(self, hidden_states, attention_mask,
                 encoder_output=None, enc_dec_attn_mask=None,
-                layer_past=None, get_key_value=False, alibi=None):
+                layer_past=None, get_key_value=False):
         # hidden_states: [b, s, h]
 
         # Layer norm at the beginning of the transformer layer.
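To make the ramp-up concern in the review comment above concrete, here is a standalone illustration (not from the PR, and it assumes the pre-built bias would reach torch.baddbmm without being re-sliced): a bias built for one micro batch size cannot broadcast against activations of a different batch size, so a mismatch fails loudly rather than silently.

```python
import torch

num_heads, seq_len, head_dim = 8, 16, 4
bias_batch, runtime_batch = 2, 3   # bias built for micro_batch_size=2, activations arrive with batch 3

alibi = torch.zeros(bias_batch * num_heads, 1, seq_len)          # [16, 1, 16]
q = torch.randn(runtime_batch * num_heads, seq_len, head_dim)    # [24, 16, 4]
k = torch.randn(runtime_batch * num_heads, seq_len, head_dim)    # [24, 16, 4]

try:
    torch.baddbmm(alibi, q, k.transpose(1, 2), beta=1.0, alpha=1.0)
except RuntimeError as err:
    # The bias [16, 1, 16] cannot broadcast to the [24, 16, 16] result shape.
    print("shape mismatch:", err)
```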
@@ -483,7 +509,7 @@ def forward(self, hidden_states, attention_mask,
                                   attention_mask,
                                   layer_past=layer_past,
                                   get_key_value=get_key_value,
-                                  alibi=alibi)
+                                  alibi=self.alibi)
 
         if get_key_value:
             attention_output, presents = attention_output
@@ -561,6 +587,30 @@ def forward(self, hidden_states, attention_mask,
 
         return output
 
+    @staticmethod
+    def _build_alibi_tensor(max_seq_len, num_attention_heads, batch_size):
+        # Based on https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
+        """Returns tensor shaped (batch_size * num_attention_heads, 1, max_seq_len)"""
+
+        def get_slopes(n):
+            def get_slopes_power_of_2(n):
+                start = (2 ** (-2 ** -(math.log2(n) - 3)))
+                ratio = start
+                return [start * ratio ** i for i in range(n)]
+
+            if math.log2(n).is_integer():
+                return get_slopes_power_of_2(n)
+            else:
+                closest_power_of_2 = 2 ** math.floor(math.log2(n))
+                return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][
+                    :n - closest_power_of_2]
+
+        slopes = torch.Tensor(get_slopes(num_attention_heads))
+        alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand(
+            num_attention_heads, -1, -1)
+        alibi = alibi.repeat(batch_size, 1, 1)
+        return alibi
+
 
 class ParallelTransformerLayerPipe(ParallelTransformerLayer):
     """Extends ParallelTransformerLayer to forward attention_mask through the pipeline.
@@ -600,27 +650,6 @@ def forward(self, inputs, **kwargs):
 class ParallelTransformer(MegatronModule):
     """Transformer class."""
 
-    @staticmethod
-    def _build_alibi_tensor(max_seq_len, num_attention_heads, batch_size):
-        # Based on https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
-        """Returns tensor shaped (batch_size * num_attention_heads, 1, max_seq_len)"""
-        def get_slopes(n):
-            def get_slopes_power_of_2(n):
-                start = (2 ** (-2 ** -(math.log2(n) - 3)))
-                ratio = start
-                return [start * ratio ** i for i in range(n)]
-
-            if math.log2(n).is_integer():
-                return get_slopes_power_of_2(n)
-            else:
-                closest_power_of_2 = 2 ** math.floor(math.log2(n))
-                return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][
-                    :n - closest_power_of_2]
-        slopes = torch.Tensor(get_slopes(num_attention_heads))
-        alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand(num_attention_heads, -1, -1)
-        alibi = alibi.repeat(batch_size, 1, 1)
-        return alibi
-
     def __init__(self, init_method, output_layer_init_method,
                  layer_type=LayerType.encoder,
                  self_attn_mask_type=AttnMaskType.padding,
@@ -687,20 +716,11 @@ def build_layer(layer_number):
             get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
             checkpoint = deepspeed.checkpointing.checkpoint
 
-        if args.position_embedding_type == PositionEmbeddingType.alibi:
-            self.alibi = self._build_alibi_tensor(args.seq_length, args.num_attention_heads, args.micro_batch_size).to(torch.cuda.current_device())
-            if args.params_dtype == torch.float16:
-                self.alibi = self.alibi.to(torch.float16)
-            elif args.params_dtype == torch.bfloat16:
-                self.alibi = self.alibi.to(torch.bfloat16)
-        else:
-            self.alibi = None
-
     def _get_layer(self, layer_number):
         return self.layers[layer_number]
 
     def _checkpointed_forward(self, hidden_states, attention_mask,
-                              encoder_output, enc_dec_attn_mask, alibi=None):
+                              encoder_output, enc_dec_attn_mask):
         """Forward method with activation checkpointing."""
         def custom(start, end):
             def custom_forward(*inputs):
@@ -710,7 +730,7 @@ def custom_forward(*inputs):
                 enc_dec_attn_mask = inputs[3]
                 for index in range(start, end):
                     layer = self._get_layer(index)
-                    x_ = layer(x_, attention_mask, encoder_output, enc_dec_attn_mask, alibi=alibi)
+                    x_ = layer(x_, attention_mask, encoder_output, enc_dec_attn_mask)
                 return x_
             return custom_forward
 
@@ -767,8 +787,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None,
             hidden_states = self._checkpointed_forward(hidden_states,
                                                        attention_mask,
                                                        encoder_output,
-                                                       enc_dec_attn_mask,
-                                                       alibi=self.alibi)
+                                                       enc_dec_attn_mask)
         else:
             if get_key_value:
                 presents = []
@@ -782,8 +801,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None,
                                       encoder_output=encoder_output,
                                       enc_dec_attn_mask=enc_dec_attn_mask,
                                       layer_past=past,
-                                      get_key_value=get_key_value,
-                                      alibi=self.alibi)
+                                      get_key_value=get_key_value)
                 if get_key_value:
                     hidden_states, present = hidden_states
                     presents.append(present)
Testing purposes.
Just for information: on my side I was going to propose isolating the three ways of computing the score according to the positional embedding, by creating a method for each positional-embedding method and wrapping these methods in log_debug_usage so we get a log (as for the activation functions) to detect in the tests. The minor advantage is that it also allows testing rotary and absolute (in all cases), but I don't mind if you think it's easier to keep it the way you did.
Yeah, it's true that your version looks nice. I just think it's an incorrect abstraction: there's no reason to group all the positional embeddings together (they get applied in different places, they do different things, and they have different constraints despite sharing the common purpose of providing sequential information). One could argue that using a pure causal mask is a position-embedding mechanism.
What I was thinking of is abstracting only the alibi function into a separate function so it can use the pretty decorator, but I was lazy ^^'
@log_debug_usage(logger, msg)
I agree with this statement :) Models that don't have any position embeddings (like sinusoidal or learned or alibi) are actually able to achieve good (but not great) PPL because the causal mask encodes some kind of order.
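For illustration of the refactor discussed above, a hypothetical sketch (not part of the PR): the alibi score computation pulled into its own function and wrapped in a log_debug_usage-style decorator instead of the hasattr bookkeeping. The decorator body below is a made-up minimal stand-in so the snippet is self-contained; the repo's actual helper and the surrounding attention code may differ.

```python
import functools
import logging

import torch

logger = logging.getLogger(__name__)


def log_debug_usage(log, msg):
    """Minimal stand-in: log `msg` once, the first time the wrapped function is called."""
    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            if not getattr(wrapped, "_logged", False):
                log.debug(msg)
                wrapped._logged = True
            return func(*args, **kwargs)
        return wrapped
    return decorator


@log_debug_usage(logger, "Using Alibi.")
def alibi_attention_scores(matmul_result, query_layer, key_layer, beta, norm_factor):
    # Same baddbmm as in the diff, isolated so the decorator replaces the
    # hasattr(self, "logged_alibi") bookkeeping.
    return torch.baddbmm(
        matmul_result,
        query_layer.transpose(0, 1),                 # [b * np, sq, hn]
        key_layer.transpose(0, 1).transpose(1, 2),   # [b * np, hn, sk]
        beta=beta, alpha=(1.0 / norm_factor))
```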