From d7b5c1c28c0736c6aab5c8a74b2054b32d7f7c6f Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Tue, 4 Jul 2023 10:28:31 +0800
Subject: [PATCH] [shardformer] added development protocol for standardization
 (#4149)

---
 colossalai/shardformer/README.md              | 13 ++++
 colossalai/shardformer/model/modeling_bert.py | 67 ------------------
 .../{model => modeling}/__init__.py           |  0
 colossalai/shardformer/modeling/bloom.py      | 69 +++++++++++++++++++
 colossalai/shardformer/policies/bloom.py      | 64 ++---------------
 5 files changed, 86 insertions(+), 127 deletions(-)
 delete mode 100644 colossalai/shardformer/model/modeling_bert.py
 rename colossalai/shardformer/{model => modeling}/__init__.py (100%)
 create mode 100644 colossalai/shardformer/modeling/bloom.py

diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index f5d8bb35d91d..fca401562be6 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -321,6 +321,19 @@ This section serves as the guideline for writing new policies and register them

 You can create a new file in the `colossalai/shardformer/policies` folder and name the file with the model name. You can implement your policy in this file. You should not import the any model zoo library at the header section of the file because we do not want to import the library when we do not use the policy. Libraries such as `transformers` should be imported only in the function body when needed.

+Please follow these protocols when writing your policy:
+
+- You have to make a clear decision about what exactly you want to replace in the original PyTorch module
+  - Use `ModulePolicyDescription.attribute_replacement` to replace the module attributes
+  - Use `ModulePolicyDescription.param_replacement` to replace the module parameters
+  - Use `ModulePolicyDescription.sub_module_replacement` to replace the submodules completely. The target module should implement the `from_native_module` method so that it can be created from the native module.
+  - Use `ModulePolicyDescription.method_replacement` to replace the module methods. **These replacement methods should be put in the `shardformer/modeling/<model-name>.py` file**.
+- You can implement the `ParallelModule` for primitive modules in the `shardformer/layer/<module-name>.py` file. Primitive modules refer to modules which are not composed of other modules. For example, the `torch.nn.Linear` module is a primitive module, while modules such as the `BertEncoder` module in the `transformers` library are composite modules. Primitive modules do not contain nested inner `nn.Module` members. For composite modules, you should consider using `ModulePolicyDescription` to implement your replacement.
+- `ParallelModule` is meant to be used in two ways: `ParallelModule.from_native_module` to convert a native PyTorch module to a `ParallelModule`, and `ParallelModule(...)` to instantiate the module directly just like a normal PyTorch module. `ParallelModule` should only be implemented for modules whose weights are sharded. If you want to make your module compatible with `ModulePolicyDescription.sub_module_replacement` and there is no weight sharding in your module, you can just implement the `from_native_module` method without inheriting from `ParallelModule`, as is done in `colossalai/shardformer/layer/normalization.py`.
+- **Do not import any file in the `colossalai/shardformer/policies` and `colossalai/shardformer/modeling` folders, to avoid unwanted import errors**. 
For example, a file in these folders accidentally imports `transformers` library at the top of the file, then the user will have to install `transformers` library even if they do not use this file. Any file in the `modeling` folder should be only imported by the policy file. A policy implementation should be only imported dynamically via the autopolicy or manually via the `ShardFormer` module. +- Try to keep your import statement on third-party libraries such as `transformers` within the function body instead of the header section of the file. This is because we do not want to import the library when we do not use the policy. + + - Step 2. Register your policy to the autopolicy Next, you need to register your policy in the `colossalai/shardformer/policies/autopolicy.py` file. diff --git a/colossalai/shardformer/model/modeling_bert.py b/colossalai/shardformer/model/modeling_bert.py deleted file mode 100644 index bd07ab80c00d..000000000000 --- a/colossalai/shardformer/model/modeling_bert.py +++ /dev/null @@ -1,67 +0,0 @@ -from typing import Any, Dict, List, Type - -import torch -import torch.nn as nn -from torch.nn import CrossEntropyLoss -from transformers import BertForMaskedLM -from transformers.models.bert.modeling_bert import MaskedLMOutput - -from ..layer.dist_crossentropy import applyDistCrossEntropy - - -class BertForMaskedLM_(BertForMaskedLM): - - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, - ): - # print("[Inject OK] Injected forward method") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - - if labels is not None: - masked_lm_loss = applyDistCrossEntropy(prediction_scores, labels) - # if labels is not None: - # loss_fct = CrossEntropyLoss() # -100 index = padding token - # masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/colossalai/shardformer/model/__init__.py b/colossalai/shardformer/modeling/__init__.py similarity index 100% rename from colossalai/shardformer/model/__init__.py rename to colossalai/shardformer/modeling/__init__.py diff --git a/colossalai/shardformer/modeling/bloom.py b/colossalai/shardformer/modeling/bloom.py new file mode 100644 index 000000000000..a3d774ff2abb --- /dev/null +++ b/colossalai/shardformer/modeling/bloom.py @@ -0,0 +1,69 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + + +def build_bloom_alibi_tensor_fn(process_group: ProcessGroup) -> torch.Tensor: 
+ + def build_bloom_alibi_tensor(self, attention_mask: torch.Tensor, num_heads: int, + dtype: torch.dtype) -> torch.Tensor: + """ + Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it + relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value + `softmax(l+a) = softmax(l)`. Based on + https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly. + + Args: + Returns tensor shaped (batch_size * num_heads, 1, max_seq_len) + attention_mask (`torch.Tensor`): + Token-wise attention mask, this should be of shape (batch_size, max_seq_len). + num_heads (`int`, *required*): + number of heads + dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`): + dtype of the output tensor + """ + import math + + if dist.is_initialized(): + world_size = dist.get_world_size(process_group) + num_heads = num_heads * world_size + + batch_size, seq_length = attention_mask.shape + closest_power_of_2 = 2**math.floor(math.log2(num_heads)) + base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))), + device=attention_mask.device, + dtype=torch.float32) + powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != num_heads: + extra_base = torch.tensor(2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), + device=attention_mask.device, + dtype=torch.float32) + num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) + extra_powers = torch.arange(1, + 1 + 2 * num_remaining_heads, + 2, + device=attention_mask.device, + dtype=torch.int32) + slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) + + # Note: alibi will added to the attention bias that will be applied to the query, key product of attention + # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) + # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) + # => the query_length dimension will then be broadcasted correctly + # This is more or less identical to T5's relative position bias: + # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 + arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] + alibi = slopes[..., None] * arange_tensor + if dist.is_initialized(): + num_heads_per_rank = int(num_heads / dist.get_world_size(process_group)) + offset = dist.get_rank(process_group) * num_heads_per_rank + alibi = alibi.view(batch_size, num_heads, 1, seq_length) + alibi = alibi[:, offset:num_heads_per_rank + offset, :, :] + return alibi.reshape(batch_size * num_heads_per_rank, 1, seq_length).to(dtype) + else: + return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) + + return build_bloom_alibi_tensor diff --git a/colossalai/shardformer/policies/bloom.py b/colossalai/shardformer/policies/bloom.py index 030774a919d7..a0b5340f72bc 100644 --- a/colossalai/shardformer/policies/bloom.py +++ b/colossalai/shardformer/policies/bloom.py @@ -1,70 +1,12 @@ -import torch -import torch.distributed as dist import torch.nn as nn import colossalai.shardformer.layer as col_nn from .._utils import getattr_, setattr_ +from 
..modeling.bloom import build_bloom_alibi_tensor_fn from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription -def build_bloom_alibi_tensor(self, attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: - """ - Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it - relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value - `softmax(l+a) = softmax(l)`. Based on - https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 - TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly. - - Args: - Returns tensor shaped (batch_size * num_heads, 1, max_seq_len) - attention_mask (`torch.Tensor`): - Token-wise attention mask, this should be of shape (batch_size, max_seq_len). - num_heads (`int`, *required*): - number of heads - dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`): - dtype of the output tensor - """ - import math - - if dist.is_initialized(): - world_size = dist.get_world_size() - num_heads = num_heads * world_size - - batch_size, seq_length = attention_mask.shape - closest_power_of_2 = 2**math.floor(math.log2(num_heads)) - base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))), - device=attention_mask.device, - dtype=torch.float32) - powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != num_heads: - extra_base = torch.tensor(2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), - device=attention_mask.device, - dtype=torch.float32) - num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) - extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32) - slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) - - # Note: alibi will added to the attention bias that will be applied to the query, key product of attention - # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) - # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) - # => the query_length dimension will then be broadcasted correctly - # This is more or less identical to T5's relative position bias: - # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 - arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] - alibi = slopes[..., None] * arange_tensor - if dist.is_initialized(): - num_heads_per_rank = int(num_heads / dist.get_world_size()) - offset = dist.get_rank() * num_heads_per_rank - alibi = alibi.view(batch_size, num_heads, 1, seq_length) - alibi = alibi[:, offset:num_heads_per_rank + offset, :, :] - return alibi.reshape(batch_size * num_heads_per_rank, 1, seq_length).to(dtype) - else: - return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) - - class BloomPolicy(Policy): def config_sanity_check(self): @@ -120,7 +62,9 @@ def module_policy(self): attribute_replacement={ "num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size, }, - method_replacement={"build_alibi_tensor": build_bloom_alibi_tensor}, + method_replacement={ + "build_alibi_tensor": 
build_bloom_alibi_tensor_fn(self.shard_config.tensor_parallel_process_group) + }, sub_module_replacement=[ SubModuleReplacementDescription( suffix="word_embeddings",
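
To make the protocol above concrete, here is a minimal sketch of how a policy file is expected to wire these pieces together, modeled on the `BloomPolicy` changes in this patch. It is illustrative only and not part of the patch: the dictionary-style return value of `module_policy` and the `target_module=col_nn.VocabParallelEmbedding1D` argument are assumptions about surrounding code not shown in this diff, so check `colossalai/shardformer/policies/basepolicy.py` and the full `bloom.py` policy for the exact signatures.

```python
# Illustrative sketch of a policy following the development protocol:
# the replacement method lives in shardformer/modeling/bloom.py and the
# policy file only wires it in; `transformers` is imported lazily.

import colossalai.shardformer.layer as col_nn

from ..modeling.bloom import build_bloom_alibi_tensor_fn
from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription


class BloomPolicySketch(Policy):
    # Other hooks required by `Policy` are omitted from this sketch.

    def config_sanity_check(self):
        pass

    def module_policy(self):
        # Keep third-party imports inside the function body so that users
        # only need `transformers` installed when this policy is actually used.
        from transformers.models.bloom.modeling_bloom import BloomModel

        return {
            BloomModel:
                ModulePolicyDescription(
                    # attribute replacement: each tensor-parallel rank holds a slice of the heads
                    attribute_replacement={
                        "num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size,
                    },
                    # method replacement: the factory closes over the tensor-parallel process group
                    # and returns the function that replaces `build_alibi_tensor`
                    method_replacement={
                        "build_alibi_tensor":
                            build_bloom_alibi_tensor_fn(self.shard_config.tensor_parallel_process_group),
                    },
                    # submodule replacement: the target module must implement `from_native_module`
                    sub_module_replacement=[
                        SubModuleReplacementDescription(
                            suffix="word_embeddings",
                            # assumed target class; verify against colossalai.shardformer.layer
                            target_module=col_nn.VocabParallelEmbedding1D,
                        ),
                    ],
                ),
        }
```

Passing the process group to `build_bloom_alibi_tensor_fn` at policy-construction time, rather than reading the default distributed group inside the replacement method as the deleted version did, lets the relocated function in `shardformer/modeling/bloom.py` stay free of policy state while still sharding the alibi heads per rank.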