From 4403382e70a61777a4b28c4b7521a8d5083d3fbc Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 30 Sep 2024 16:03:39 +0200 Subject: [PATCH 01/39] fix converter for function definitions --- utils/modular_model_converter.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index c5bf769f9288..63967309c40d 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -537,7 +537,7 @@ def __init__(self, python_module, new_name, given_old_name=None, given_new_name= "feature_extractor": {}, } self.match_patterns = "|".join(self.files.keys()) - self.all_functions = {} + self.all_definitions = {} def visit_ImportFrom(self, node: cst.ImportFrom) -> None: """When visiting imports from `transformers.models.xxx` we need to: @@ -590,6 +590,11 @@ def leave_SimpleStatementLine(self, original_node, updated_node): self.global_scope_index += 100 return updated_node + @m.visit(m.ClassDef() | m.FunctionDef() | m.AddAssign()) + def create_global_node(self, node): + name = re.search(r"(?:def|class)\s+([a-zA-Z_]\w*)|([A-Z_]\w*)", self.python_module.code_for_node(node)) + self.all_nodes[name] = node + def leave_ClassDef(self, original_node, updated_node): """ 1. Filter the `base` classes of this class @@ -647,9 +652,12 @@ def leave_ClassDef(self, original_node, updated_node): node = class_finder.global_nodes.get(dependency, None) if node is not None: if dependency not in file_to_update: + node = self.all_definitions.get(dependency, node) start_insert_idx -= 1 file_to_update[dependency] = {"insert_idx": start_insert_idx, "node": node} elif dependency not in self.inserted_deps: + print("processing :", dependency) + # make sure the node is written after its dependencies start_insert_idx = file_to_update[dependency]["insert_idx"] - 1 if ( @@ -683,6 +691,12 @@ def leave_ClassDef(self, original_node, updated_node): self.files["modeling"][class_name] = {"insert_idx": self.global_scope_index, "node": updated_node} return updated_node + def leave_FunctionDef(self, original_node, node): + parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, original_node) + if m.matches(parent_node, m.Module()): + self.all_definitions[node.name.value] = node + return node + def leave_If(self, original_node, node): parent_node = self.get_metadata(cst.metadata.ParentNodeProvider, original_node) if m.matches(parent_node, m.Module()): @@ -757,7 +771,7 @@ def save_modeling_file(modular_file, converted_file): parser = argparse.ArgumentParser() parser.add_argument( "--files_to_parse", - default=["all"], + default=["src/transformers/models/glm/modular_glm.py"], nargs="+", help="A list of `modular_xxxx` files that should be converted to single model file", ) From 18b2c0ca697c2858672286fc0af02774915ea57a Mon Sep 17 00:00:00 2001 From: mobicham <37179323+mobicham@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:47:18 +0200 Subject: [PATCH 02/39] Hqq serialization (#33141) * HQQ model serialization attempt * fix hqq dispatch and unexpected keys * style * remove check_old_param * revert to check HQQLinear in quantizer_hqq.py * revert to check HQQLinear in quantizer_hqq.py * update HqqConfig default params * make ci happy * make ci happy * revert to HQQLinear check in quantizer_hqq.py * check hqq_min version 0.2.0 * set axis=1 as default in quantization_config.py * validate_env with hqq>=0.2.0 version message * deprecated hqq kwargs message * make ci happy * remove run_expected_keys_check hack + bump to 
0.2.1 min hqq version * fix unexpected_keys hqq update * add pre_quantized check * add update_expected_keys to base quantizerr * ci base.py fix? * ci base.py fix? * fix "quantization typo" src/transformers/utils/quantization_config.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * fix post merge --------- Co-authored-by: Marc Sun Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- docs/source/en/quantization/hqq.md | 6 +- src/transformers/integrations/hqq.py | 12 +- src/transformers/modeling_utils.py | 11 +- src/transformers/quantizers/base.py | 12 ++ src/transformers/quantizers/quantizer_hqq.py | 113 ++++++++++++++++-- src/transformers/utils/import_utils.py | 7 +- src/transformers/utils/quantization_config.py | 42 ++++--- tests/quantization/hqq/test_hqq.py | 71 ++++++----- 8 files changed, 214 insertions(+), 60 deletions(-) mode change 100644 => 100755 docs/source/en/quantization/hqq.md mode change 100644 => 100755 src/transformers/quantizers/base.py diff --git a/docs/source/en/quantization/hqq.md b/docs/source/en/quantization/hqq.md old mode 100644 new mode 100755 index 11489808aecb..34608cd64fd8 --- a/docs/source/en/quantization/hqq.md +++ b/docs/source/en/quantization/hqq.md @@ -30,13 +30,13 @@ To quantize a model, you need to create an [`HqqConfig`]. There are two ways of from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig # Method 1: all linear layers will use the same quantization config -quant_config = HqqConfig(nbits=8, group_size=64, quant_zero=False, quant_scale=False, axis=0) #axis=0 is used by default +quant_config = HqqConfig(nbits=8, group_size=64) ``` ``` Python # Method 2: each linear layer with the same tag will use a dedicated quantization config -q4_config = {'nbits':4, 'group_size':64, 'quant_zero':False, 'quant_scale':False} -q3_config = {'nbits':3, 'group_size':32, 'quant_zero':False, 'quant_scale':False} +q4_config = {'nbits':4, 'group_size':64} +q3_config = {'nbits':3, 'group_size':32} quant_config = HqqConfig(dynamic_config={ 'self_attn.q_proj':q4_config, 'self_attn.k_proj':q4_config, diff --git a/src/transformers/integrations/hqq.py b/src/transformers/integrations/hqq.py index 10a6d06a3f9f..162b365668a0 100755 --- a/src/transformers/integrations/hqq.py +++ b/src/transformers/integrations/hqq.py @@ -66,6 +66,10 @@ def _prepare_for_hqq_linear(model, patch_params, has_been_replaced, current_key_ has_been_replaced = True + # Add these fake parameters to avoid loading fail + for att in ["W_q", "meta"]: + setattr(module, att, None) + if len(list(module.children())) > 0: _, has_been_replaced = _prepare_for_hqq_linear( module, @@ -97,7 +101,7 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve # Convert quantization_config to layer-wise config skip_modules = quantization_config.skip_modules - quant_config = quantization_config.to_dict() + quant_config = quantization_config.quant_config linear_tags = list(set(linear_tags) - set(skip_modules) - set(modules_to_not_convert)) if any(key in linear_tags for key in quant_config.keys()): @@ -113,7 +117,11 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve ) # We store quantization config as linear_tag -> hqq quant config - model.config.quantization_config = patch_params + model.config.quantization_config = { + "quant_config": quant_config, + "quant_method": quantization_config.quant_method, + "skip_modules": skip_modules, + } if not has_been_replaced: logger.warning("No linear modules were found 
in your model for quantization.") diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index fc0d6748cd1d..df0519566766 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -934,12 +934,17 @@ def _load_state_dict_into_meta_model( # For compatibility with PyTorch load_state_dict which converts state dict dtype to existing dtype in model, and which # uses `param.copy_(input_param)` that preserves the contiguity of the parameter in the model. # Reference: https://github.com/pytorch/pytorch/blob/db79ceb110f6646523019a59bbd7b838f43d4a86/torch/nn/modules/module.py#L2040C29-L2040C29 + old_param = model splits = param_name.split(".") for split in splits: old_param = getattr(old_param, split) + # Not all the attributes of a module are Parameters/Tensor + if not isinstance(old_param, (torch.nn.Parameter, torch.Tensor)): + old_param = None if old_param is None: break + if old_param is not None: if dtype is None: param = param.to(old_param.dtype) @@ -3819,6 +3824,7 @@ def from_pretrained( from_pt = not (from_tf | from_flax) # load pt weights early so that we know which dtype to init the model under + if from_pt: if not is_sharded and state_dict is None: # Time to load the checkpoint @@ -4176,6 +4182,9 @@ def _load_pretrained_model( expected_keys = list(model_state_dict.keys()) prefix = model.base_model_prefix + if hf_quantizer is not None: + expected_keys = hf_quantizer.update_expected_keys(model, expected_keys, loaded_keys) + def _fix_key(key): if "beta" in key: return key.replace("beta", "bias") @@ -4290,7 +4299,7 @@ def _fix_key(key): value = torch.empty(*param.size(), dtype=target_dtype) if ( not is_quantized - or getattr(hf_quantizer, "requires_parameters_quantization", False) + or (getattr(hf_quantizer, "requires_parameters_quantization", False)) or not hf_quantizer.check_quantized_param( model, param_value=value, param_name=key, state_dict={} ) diff --git a/src/transformers/quantizers/base.py b/src/transformers/quantizers/base.py old mode 100644 new mode 100755 index 73b3dbd8b259..015c0015cf7e --- a/src/transformers/quantizers/base.py +++ b/src/transformers/quantizers/base.py @@ -109,6 +109,18 @@ def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> Li """ return missing_keys + def update_expected_keys(self, model, expected_keys: List[str], loaded_keys: List[str]) -> List[str]: + """ + Override this method if you want to adjust the `update_expected_keys`. + + Args: + expected_keys (`List[str]`, *optional*): + The list of the expected keys in the initialized model. + loaded_keys (`List[str]`, *optional*): + The list of the loaded keys in the checkpoint. + """ + return expected_keys + def get_special_dtypes_update(self, model, torch_dtype: "torch.dtype") -> Dict[str, "torch.dtype"]: """ returns dtypes for modules that are not quantized - used for the computation of the device_map in case diff --git a/src/transformers/quantizers/quantizer_hqq.py b/src/transformers/quantizers/quantizer_hqq.py index cd32a99c00ac..775fea8f4901 100755 --- a/src/transformers/quantizers/quantizer_hqq.py +++ b/src/transformers/quantizers/quantizer_hqq.py @@ -62,7 +62,7 @@ def __init__(self, quantization_config, **kwargs): def validate_environment(self, *args, **kwargs): if not (is_hqq_available()): raise ImportError( - "HQQ is not available. Please follow the instructions to install it: `https://github.com/mobiusml/hqq/`" + "A valid HQQ version (>=0.2.1) is not available. 
Please follow the instructions to install it: `https://github.com/mobiusml/hqq/`." ) if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): @@ -91,6 +91,65 @@ def validate_environment(self, *args, **kwargs): else: self.using_multi_gpu = len(set(device_map.values())) > 1 + def update_missing_keys( + self, model: "PreTrainedModel", missing_keys: List[str], prefix: str, **kwargs + ) -> List[str]: + if self.pre_quantized: + return [key for key in missing_keys if ("weight" not in key)] + else: + return missing_keys + + # Adds missing keys for HQQLinear modules that are loaded but the model with initialized with torch.nn.Linear + def update_expected_keys( + self, model: "PreTrainedModel", expected_keys: List[str], loaded_keys: List[str] + ) -> List[str]: + if not self.pre_quantized: + return expected_keys + + # Collects all quantizable (linear) layers + def _find_hqq_quantizable_layers(model, layers): + for name, module in model.named_children(): + if isinstance(module, (torch.nn.Linear)): + layers.add(module.name) + _find_hqq_quantizable_layers(module, layers) + + new_keys = set(expected_keys) + if is_hqq_available(): + from hqq.core.quantize import HQQLinear + + # Name modules + for name, module in model.named_modules(): + module.name = name + + # valid modules are Linear layers that have HQQLinear state_dict. We ignore skip_modules and any layers with Linear state_dict() params + _valid_modules = set() + _find_hqq_quantizable_layers(model, _valid_modules) + _valid_modules -= set(model.config.quantization_config["skip_modules"]) + + # Append new expected layers based on _ref_keys + _ref_keys = HQQLinear( + linear_layer=None, quant_config=None, compute_dtype=torch.float16, device="cpu" + ).state_dict_keys() - {"bias"} + + # Clean-up + _rm_keys = set() + for key in new_keys: + if any(_module in key for _module in _valid_modules): + _rm_keys.add(key) + new_keys -= _rm_keys + # At this point, new_keys contains all the keys of the layers that are NOT HQQLinear or torch.nn.Linear + + # Re-populate Linear/HQQLinear + for _module in _valid_modules: + if _module + ".weight" in loaded_keys: + new_keys.add(_module + ".weight") + else: + new_keys.update({_module + "." 
+ _ref_key for _ref_key in _ref_keys}) + if _module + ".bias" in loaded_keys: + new_keys.add(_module + ".bias") + + return list(new_keys) + def check_quantized_param( self, model: "PreTrainedModel", @@ -99,9 +158,18 @@ def check_quantized_param( state_dict: Dict[str, Any], **kwargs, ) -> bool: + if is_hqq_available(): + from hqq.core.quantize import HQQLinear module, tensor_name = get_module_from_name(model, param_name) - return isinstance(module, torch.nn.Linear) and (tensor_name == "weight") + if self.pre_quantized: + return ( + (isinstance(module, torch.nn.Linear) or isinstance(module, HQQLinear)) + and tensor_name != "weight" + and tensor_name != "bias" + ) + else: + return isinstance(module, torch.nn.Linear) and tensor_name == "weight" def create_quantized_param( self, @@ -122,13 +190,43 @@ def create_quantized_param( from hqq.core.quantize import HQQLinear module, tensor_name = get_module_from_name(model, param_name) - - layer_name = param_name.replace(".weight", "").replace(".bias", "") + layer_name = ".".join(param_name.split(".")[:-1]) parent_module = find_parent(model, layer_name) node = layer_name.split(".")[-1] - # Step 0: set module state_dict - module_state_dict = {key.split(".")[-1]: state_dict[key] for key in state_dict if layer_name in key} + # set module state_dict + module_state_dict = {} + for k, v in state_dict.items(): + if layer_name + "." in k: + module_state_dict[k.split(".")[-1]] = v + if unexpected_keys is not None and k in unexpected_keys: + unexpected_keys.remove(k) + + if self.pre_quantized: + if isinstance(module, HQQLinear): + return + else: + hqq_layer = HQQLinear( + linear_layer=None, + quant_config=None, + compute_dtype=self.torch_dtype, + device=target_device, + ) + + hqq_layer.load_state_dict(module_state_dict) + + if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor): + hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias) + + if self.using_multi_gpu: + hqq_layer = self._patch_layer_for_multigpu(hqq_layer) + + setattr(parent_module, node, hqq_layer) + + # cleanup + del module.__dict__, module + torch.cuda.empty_cache() + return # Step 1: populate module with weight/bias from module state dict for key in module_state_dict: @@ -136,7 +234,6 @@ def create_quantized_param( # Step 2: Replace module with either HQQLinear or move it to device. We do this via setattr on the parent as doing on it on the module # directly doesn't work. 
- if hasattr(module, "quant_config"): hqq_layer = HQQLinear( module, @@ -192,7 +289,7 @@ def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs return model def is_serializable(self, safe_serialization=None): - return False + return True @property def is_trainable(self) -> bool: diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 169d3491053e..a98b17e4bd57 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -92,6 +92,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ FSDP_MIN_VERSION = "1.12.0" GGUF_MIN_VERSION = "0.10.0" XLA_FSDPV2_MIN_VERSION = "2.2.0" +HQQ_MIN_VERSION = "0.2.1" _accelerate_available, _accelerate_version = _is_package_available("accelerate", return_version=True) @@ -181,7 +182,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _torchdistx_available = _is_package_available("torchdistx") _torchvision_available = _is_package_available("torchvision") _mlx_available = _is_package_available("mlx") -_hqq_available = _is_package_available("hqq") +_hqq_available, _hqq_version = _is_package_available("hqq", return_version=True) _tiktoken_available = _is_package_available("tiktoken") _blobfile_available = _is_package_available("blobfile") _liger_kernel_available = _is_package_available("liger_kernel") @@ -323,8 +324,8 @@ def is_torch_deterministic(): return True -def is_hqq_available(): - return _hqq_available +def is_hqq_available(min_version: str = HQQ_MIN_VERSION): + return _hqq_available and version.parse(_hqq_version) >= version.parse(min_version) def is_pygments_available(): diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 19166f9ed92a..8be0bb672e51 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -193,15 +193,9 @@ class HqqConfig(QuantizationConfigMixin): Number of bits. Supported values are (8, 4, 3, 2, 1). group_size (`int`, *optional*, defaults to 64): Group-size value. Supported values are any value that is divisble by weight.shape[axis]). - quant_zero (`bool`, *optional*, defaults to `True`): - Quantize the zero-point if set to `True`. - quant_scale (`bool`, *optional*, defaults to `False`): - Quantize the scaling if set to `True`. - offload_meta (`bool`, *optional*, defaults to `False`): - Offload the meta-data to the CPU if set to `True`. view_as_float (`bool`, *optional*, defaults to `False`): View the quantized weight as float (used in distributed training) if set to `True`. - axis (`int`, *optional*, defaults to 0): + axis (`Optional[int]`, *optional*): Axis along which grouping is performed. Supported values are 0 or 1. dynamic_config (dict, *optional*): Parameters for dynamic configuration. The key is the name tag of the layer and the value is a quantization config. 
@@ -216,11 +210,8 @@ def __init__( self, nbits: int = 4, group_size: int = 64, - quant_zero: bool = True, - quant_scale: bool = False, - offload_meta: bool = False, view_as_float: bool = False, - axis: int = 0, + axis: Optional[int] = None, dynamic_config: Optional[dict] = None, skip_modules: List[str] = ["lm_head"], **kwargs, @@ -228,6 +219,16 @@ def __init__( if is_hqq_available(): from hqq.core.quantize import BaseQuantizeConfig as HQQBaseQuantizeConfig + for deprecated_key in ["quant_zero", "quant_scale", "offload_meta"]: + if deprecated_key in kwargs: + logger.info( + deprecated_key + " is deprecated. This parameter will be ignored in quantization settings." + ) + + if axis is None: + axis = 1 + logger.info("Setting axis=1 as faster backends such as TorchAO or BitBlas are only compatible with it.") + if axis not in [0, 1]: raise ValueError("Invalid axis value. Only 0 and 1 are allowed.") @@ -240,9 +241,6 @@ def __init__( **{ "nbits": nbits, "group_size": group_size, - "quant_zero": quant_zero, - "quant_scale": quant_scale, - "offload_meta": offload_meta, "view_as_float": view_as_float, "axis": axis, } @@ -259,12 +257,26 @@ def post_init(self): """ pass + @classmethod + def from_dict(cls, config: Dict[str, Any]): + """ + Override from_dict, used in AutoQuantizationConfig.from_dict in quantizers/auto.py + """ + instance = cls() + instance.quant_config = config["quant_config"] + instance.skip_modules = config["skip_modules"] + return instance + def to_dict(self) -> Dict[str, Any]: """ Serializes this instance to a Python dictionary. Returns: `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. """ - return self.quant_config + return { + "quant_config": self.quant_config, + "quant_method": self.quant_method, + "skip_modules": self.skip_modules, + } def __repr__(self): config_dict = self.to_dict() diff --git a/tests/quantization/hqq/test_hqq.py b/tests/quantization/hqq/test_hqq.py index 45c64676a7e4..6d08a0f0e669 100755 --- a/tests/quantization/hqq/test_hqq.py +++ b/tests/quantization/hqq/test_hqq.py @@ -94,8 +94,7 @@ def test_to_dict(self): quantization_config = HqqConfig() hqq_orig_config = quantization_config.to_dict() - for key in hqq_orig_config: - self.assertEqual(quantization_config.quant_config[key], hqq_orig_config[key]) + self.assertEqual(quantization_config.quant_config, hqq_orig_config["quant_config"]) @slow @@ -109,7 +108,7 @@ def test_fp16_quantized_model(self): """ Simple LLM model testing fp16 """ - quant_config = HqqConfig(nbits=8, group_size=64, quant_zero=False, quant_scale=False, axis=0) + quant_config = HqqConfig(nbits=8, group_size=64) hqq_runner = HQQLLMRunner( model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device @@ -118,26 +117,24 @@ def test_fp16_quantized_model(self): check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj) check_forward(self, hqq_runner.model) - def test_f16_quantized_model_with_offloading(self): + +@slow +@require_torch_gpu +@require_torch_multi_gpu +@require_accelerate +class HQQTestMultiGPU(unittest.TestCase): + def tearDown(self): + cleanup() + + def test_fp16_quantized_model_multipgpu(self): """ - Simple LLM model testing bfp16 with meta-data offloading + Simple LLM model testing fp16 with multi-gpu """ - q4_config = {"nbits": 4, "group_size": 64, "quant_zero": False, "quant_scale": False} - q3_config = {"nbits": 3, "group_size": 32, "quant_zero": False, "quant_scale": False, "offload_meta": True} - quant_config = HqqConfig( - dynamic_config={ - 
"self_attn.q_proj": q4_config, - "self_attn.k_proj": q4_config, - "self_attn.v_proj": q4_config, - "self_attn.o_proj": q4_config, - "mlp.gate_proj": q3_config, - "mlp.up_proj": q3_config, - "mlp.down_proj": q3_config, - } - ) + + quant_config = HqqConfig(nbits=8, group_size=64) hqq_runner = HQQLLMRunner( - model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device + model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device="auto" ) check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj) @@ -146,22 +143,40 @@ def test_f16_quantized_model_with_offloading(self): @slow @require_torch_gpu -@require_torch_multi_gpu @require_accelerate -class HQQTestMultiGPU(unittest.TestCase): +class HQQSerializationTest(unittest.TestCase): def tearDown(self): cleanup() - def test_fp16_quantized_model_multipgpu(self): + def test_model_serialization(self): """ - Simple LLM model testing fp16 with multi-gpu + Simple HQQ LLM save/load test """ - - quant_config = HqqConfig(nbits=8, group_size=64, quant_zero=False, quant_scale=False, axis=0) + quant_config = HqqConfig(nbits=4, group_size=64) hqq_runner = HQQLLMRunner( - model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device="auto" + model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device ) - check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj) - check_forward(self, hqq_runner.model) + input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device) + + with torch.no_grad(): + logits_ref = hqq_runner.model.forward(input_tensor).logits + + # Save + saved_model_id = "quant_model" + hqq_runner.model.save_pretrained(saved_model_id) + + # Remove old model + del hqq_runner.model + torch.cuda.empty_cache() + + # Load and check if the logits match + model_loaded = AutoModelForCausalLM.from_pretrained( + "quant_model", torch_dtype=torch.float16, device_map=torch_device, low_cpu_mem_usage=True + ) + + with torch.no_grad(): + logits_loaded = model_loaded.forward(input_tensor).logits + + self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0) From 30400c09b2c2e061a98ed571f194218e8aff3508 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 24 Sep 2024 18:30:19 +0200 Subject: [PATCH 03/39] Create modular_glm.py --- src/transformers/models/glm/modular_glm.py | 223 +++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 src/transformers/models/glm/modular_glm.py diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py new file mode 100644 index 000000000000..e2e0b6fd0696 --- /dev/null +++ b/src/transformers/models/glm/modular_glm.py @@ -0,0 +1,223 @@ +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Optional, Tuple, Union +import math + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, HybridCache +from ...configuration_utils import PretrainedConfig +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from ...utils import ( + is_flash_attn_2_available, + is_flash_attn_greater_or_equal, + is_flash_attn_greater_or_equal_2_10, + is_torchdynamo_compiling, + logging, +) +from ..llama.modeling_llama import ( + LlamaRMSNorm, + LlamaRotaryEmbedding, + apply_rotary_pos_emb, + repeat_kv, +) +from ..phi3.modeling_phi3 import ( + Phi3MLP, + Phi3Attention, +) + + +if is_flash_attn_2_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +logger = logging.get_logger(__name__) + + +class GlmConfig(PretrainedConfig): + model_type = "gemma" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151552, + hidden_size=4096, + intermediate_size=13696, + num_hidden_layers=40, + num_attention_heads=32, + + + num_key_value_heads=2, # ?? + head_dim=128, + + + hidden_act="silu", + rope_type="linear", + max_position_embeddings=131072, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + # pad_token_id=0, + # eos_token_id=1, + # bos_token_id=2, + # tie_word_embeddings=True, + rope_theta=10000.0, + attention_bias=True, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.hidden_activation = hidden_activation + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + + + +class GlmRMSNorm(LlamaRMSNorm): + pass + + +# Need config.rope_type: linear +class GlmRotaryEmbedding(LlamaRotaryEmbedding): + pass + + +class GlmMLP(Phi3MLP): + pass + + +class GlmAttention(Phi3Attention): + + def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.attention_bias = config.attention_bias + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=self.attention_bias) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.") + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights += causal_mask + + # MAYBE upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + 
+ return attn_output, attn_weights, past_key_value \ No newline at end of file From 5677a55870f81fd4e85300384e881b9b18347017 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 25 Sep 2024 10:52:17 +0200 Subject: [PATCH 04/39] Update modular_glm.py --- src/transformers/models/glm/modular_glm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index e2e0b6fd0696..13ee0f3d3f03 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -220,4 +220,10 @@ def forward( if not output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value \ No newline at end of file + return attn_output, attn_weights, past_key_value + + + +GLM_ATTENTION_CLASSES = { +"eager": GlmAttention, +} \ No newline at end of file From c4c9f5c4966e31295a7156b4f118906b7fd95f22 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 25 Sep 2024 13:25:50 +0200 Subject: [PATCH 05/39] Finalize architecture without all attentions --- src/transformers/models/glm/modular_glm.py | 182 +++++++++++++++++---- 1 file changed, 148 insertions(+), 34 deletions(-) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 13ee0f3d3f03..106760b8d060 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -38,12 +38,16 @@ from ..llama.modeling_llama import ( LlamaRMSNorm, LlamaRotaryEmbedding, + LlamaModel, + LlamaForCausalLM, apply_rotary_pos_emb, repeat_kv, ) from ..phi3.modeling_phi3 import ( + Phi3Config, Phi3MLP, Phi3Attention, + Phi3DecoderLayer ) @@ -54,62 +58,65 @@ logger = logging.get_logger(__name__) -class GlmConfig(PretrainedConfig): - model_type = "gemma" - keys_to_ignore_at_inference = ["past_key_values"] +class GlmConfig(Phi3Config): + model_type = "glm" def __init__( self, + # Phi3 args vocab_size=151552, hidden_size=4096, intermediate_size=13696, num_hidden_layers=40, num_attention_heads=32, - - - num_key_value_heads=2, # ?? 
- head_dim=128, - - + num_key_value_heads=2, + resid_pdrop=0.0, + attention_dropout=0.0, hidden_act="silu", - rope_type="linear", max_position_embeddings=131072, initializer_range=0.02, rms_norm_eps=1e-5, use_cache=True, - # pad_token_id=0, - # eos_token_id=1, - # bos_token_id=2, - # tie_word_embeddings=True, + tie_word_embeddings=False, rope_theta=10000.0, + rope_scaling={"rope_type": "linear", "factor": 1.,}, + pad_token_id=0, + eos_token_id=1, + bos_token_id=2, + # Additionnal args + head_dim=128, + partial_rotary_factor=0.5, attention_bias=True, - attention_dropout=0.0, **kwargs, ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.hidden_activation = hidden_activation - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + resid_pdrop=resid_pdrop, + attention_dropout=attention_dropout, + hidden_act=hidden_act, + max_position_embeddings=max_position_embeddings, + initializer_range=initializer_range, + rms_norm_eps=rms_norm_eps, + use_cache=use_cache, + tie_word_embeddings=tie_word_embeddings, + rope_theta=rope_theta, + rope_scaling=rope_scaling, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, **kwargs, ) + self.head_dim = head_dim + self.partial_rotary_factor = partial_rotary_factor + self.attention_bias = attention_bias + del self.embd_pdrop + del self.original_max_mposition_embeddings + del self.sliding_window @@ -226,4 +233,111 @@ def forward( GLM_ATTENTION_CLASSES = { "eager": GlmAttention, -} \ No newline at end of file +} + + +class GlmDecoderLayer(Phi3DecoderLayer): + + def __init__(self, config: GlmConfig, layer_idx: int): + super().__init__() + + self.config = config + self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) + + self.mlp = GlmMLP(config) + self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) + self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) + self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask 
of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings + ) + + hidden_states = residual + self.resid_attn_dropout(attn_outputs) + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.resid_mlp_dropout(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class GlmModel(LlamaModel): + + def __init__(self, config: GlmConfig): + super().__init__(config) + self.layers = nn.ModuleList( + [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = GlmRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + +class GlmForCausalLM(LlamaForCausalLM): + + def __init__(self, config): + super().__init__(config) + self.model = GlmModel(config) + self.post_init() \ No newline at end of file From 644300433c26ea7989012bd8838ae2e1e4aaa0aa Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 25 Sep 2024 15:51:20 +0200 Subject: [PATCH 06/39] Add all attentions modules --- src/transformers/models/glm/modular_glm.py | 331 ++++++++++++++++++--- 1 file changed, 291 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 106760b8d060..9be2099ce094 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -13,26 +13,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Tuple, Union +from typing import Optional, Tuple import math import torch import torch.nn as nn import torch.utils.checkpoint -from torch.nn import CrossEntropyLoss -from ...activations import ACT2FN -from ...cache_utils import Cache, HybridCache +from ...cache_utils import Cache + from ...configuration_utils import PretrainedConfig -from ...modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) + from ...utils import ( is_flash_attn_2_available, - is_flash_attn_greater_or_equal, is_flash_attn_greater_or_equal_2_10, - is_torchdynamo_compiling, logging, ) from ..llama.modeling_llama import ( @@ -44,9 +38,7 @@ repeat_kv, ) from ..phi3.modeling_phi3 import ( - Phi3Config, Phi3MLP, - Phi3Attention, Phi3DecoderLayer ) @@ -58,7 +50,68 @@ logger = logging.get_logger(__name__) -class GlmConfig(Phi3Config): +# class GlmConfig(Phi3Config): +# model_type = "glm" + +# def __init__( +# self, +# # Phi3 args +# vocab_size=151552, +# hidden_size=4096, +# intermediate_size=13696, +# num_hidden_layers=40, +# num_attention_heads=32, +# num_key_value_heads=2, +# resid_pdrop=0.0, +# attention_dropout=0.0, +# hidden_act="silu", +# max_position_embeddings=131072, +# initializer_range=0.02, +# rms_norm_eps=1e-5, +# use_cache=True, +# tie_word_embeddings=False, +# rope_theta=10000.0, +# rope_scaling={"rope_type": "linear", "factor": 1.,}, +# pad_token_id=151329, +# eos_token_id=[151329, 151336, 151338], +# bos_token_id=None, +# # Additionnal args +# head_dim=128, +# partial_rotary_factor=0.5, +# attention_bias=True, +# **kwargs, +# ): +# super().__init__( +# vocab_size=vocab_size, +# hidden_size=hidden_size, +# intermediate_size=intermediate_size, +# num_hidden_layers=num_hidden_layers, +# num_attention_heads=num_attention_heads, +# num_key_value_heads=num_key_value_heads, +# resid_pdrop=resid_pdrop, +# attention_dropout=attention_dropout, +# hidden_act=hidden_act, +# max_position_embeddings=max_position_embeddings, +# initializer_range=initializer_range, +# rms_norm_eps=rms_norm_eps, +# use_cache=use_cache, +# tie_word_embeddings=tie_word_embeddings, +# rope_theta=rope_theta, +# rope_scaling=rope_scaling, +# pad_token_id=pad_token_id, +# bos_token_id=bos_token_id, +# eos_token_id=eos_token_id, +# **kwargs, +# ) +# self.head_dim = head_dim +# self.partial_rotary_factor = partial_rotary_factor +# self.attention_bias = attention_bias +# del self.embd_pdrop +# del self.original_max_mposition_embeddings +# del self.sliding_window + + +class GlmConfig(PretrainedConfig): model_type = "glm" def __init__( @@ -80,9 +133,9 @@ def __init__( tie_word_embeddings=False, rope_theta=10000.0, rope_scaling={"rope_type": "linear", "factor": 1.,}, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, + pad_token_id=151329, + eos_token_id=[151329, 151336, 151338], + bos_token_id=None, # Additionnal args head_dim=128, partial_rotary_factor=0.5, @@ -90,34 +143,33 @@ def __init__( **kwargs, ): super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - resid_pdrop=resid_pdrop, - attention_dropout=attention_dropout, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, pad_token_id=pad_token_id, 
bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs, ) + self.vocab_size = vocab_size + self.hidden_size=hidden_size + self.intermediate_size=intermediate_size + self.num_hidden_layers=num_hidden_layers + self.num_attention_heads=num_attention_heads + self.num_key_value_heads=num_key_value_heads + self.resid_pdrop=resid_pdrop + self.attention_dropout=attention_dropout + self.hidden_act=hidden_act + self.max_position_embeddings=max_position_embeddings + self.initializer_range=initializer_range + self.rms_norm_eps=rms_norm_eps + self.use_cache=use_cache + self.initializer_range=initializer_range + self.rms_norm_eps=rms_norm_eps + self.use_cache=use_cache + self.rope_theta=rope_theta + self.rope_scaling=rope_scaling self.head_dim = head_dim self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias - del self.embd_pdrop - del self.original_max_mposition_embeddings - del self.sliding_window - @@ -134,7 +186,7 @@ class GlmMLP(Phi3MLP): pass -class GlmAttention(Phi3Attention): +class GlmAttention(nn.Module): def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): super().__init__() @@ -170,6 +222,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, @@ -207,7 +260,7 @@ def forward( causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights += causal_mask - # MAYBE upcast attention to fp32 + # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) @@ -230,9 +283,207 @@ def forward( return attn_output, attn_weights, past_key_value +class GlmFlashAttention2(GlmAttention): + """ + GLM flash attention module. This module inherits from `GlmAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_dropout = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. + + if query_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.qkv_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + position_ids=position_ids, + dropout=attn_dropout, + sliding_window=getattr(self.config, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + + +class GlmSdpaAttention(GlmAttention): + """ + GLM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `GlmAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + logger.warning_once( + "GlmModel is using GlmSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + GLM_ATTENTION_CLASSES = { -"eager": GlmAttention, + "eager": GlmAttention, + "flash_attention_2": GlmFlashAttention2, + "sdpa": GlmSdpaAttention, } @@ -318,7 +569,7 @@ def forward( outputs += (present_key_value,) return outputs - + class GlmModel(LlamaModel): @@ -340,4 +591,4 @@ class GlmForCausalLM(LlamaForCausalLM): def __init__(self, config): super().__init__(config) self.model = GlmModel(config) - self.post_init() \ No newline at end of file + self.post_init() From ff01996d43028c5b28c6b541eabb34dcc3a176cf Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 25 Sep 2024 16:43:47 +0200 Subject: [PATCH 07/39] Finalize modular --- .../models/glm/configuration_glm.py | 86 + src/transformers/models/glm/modeling_glm.py | 1473 +++++++++++++++++ src/transformers/models/glm/modular_glm.py | 97 +- 3 files changed, 1585 insertions(+), 71 deletions(-) create mode 100644 src/transformers/models/glm/configuration_glm.py create mode 100644 src/transformers/models/glm/modeling_glm.py diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py new file mode 100644 index 000000000000..5350cbe13f5b --- /dev/null +++ b/src/transformers/models/glm/configuration_glm.py @@ -0,0 +1,86 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from . +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_xxx.py file directly. One of our CI enforces this +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from ...configuration_utils import PretrainedConfig + + +class GlmConfig(PretrainedConfig): + model_type = "glm" + + def __init__( + self, + vocab_size=151552, + hidden_size=4096, + intermediate_size=13696, + num_hidden_layers=40, + num_attention_heads=32, + num_key_value_heads=2, + resid_pdrop=0.0, + attention_dropout=0.0, + hidden_act="silu", + max_position_embeddings=131072, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling={ + "rope_type": "linear", + "factor": 1.0, + }, + pad_token_id=151329, + eos_token_id=[151329, 151336, 151338], + bos_token_id=None, + head_dim=128, + partial_rotary_factor=0.5, + attention_bias=True, + **kwargs, + ): + super().__init__( + tie_word_embeddings=tie_word_embeddings, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.resid_pdrop = resid_pdrop + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.head_dim = head_dim + self.partial_rotary_factor = partial_rotary_factor + self.attention_bias = attention_bias diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py new file mode 100644 index 000000000000..1c9d7da3fe59 --- /dev/null +++ b/src/transformers/models/glm/modeling_glm.py @@ -0,0 +1,1473 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from . +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_xxx.py file directly. One of our CI enforces this +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
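The `GlmConfig` shown above wires GLM's grouped-query attention (32 query heads over 2 key/value heads by default) together with its rotary settings. Below is a minimal, illustrative sketch that only exercises this configuration class; the import path is the new module's path from this patch, and the override values in the last line are arbitrary small numbers chosen for the example:

```python
from transformers.models.glm.configuration_glm import GlmConfig

config = GlmConfig()  # defaults as listed above
print(config.num_attention_heads, config.num_key_value_heads)  # 32 2
print(config.head_dim, config.partial_rotary_factor)           # 128 0.5

# Width of the fused qkv projection that GlmAttention derives from these values
# (the `op_size` computed in the modeling code further below):
head_dim = config.hidden_size // config.num_attention_heads  # 4096 // 32 = 128
q_width = config.num_attention_heads * head_dim              # 4096
kv_width = config.num_key_value_heads * head_dim             # 256
print(q_width + 2 * kv_width)                                 # 4608 -> qkv_proj out_features

# Overrides behave like any other PretrainedConfig subclass:
tiny = GlmConfig(num_hidden_layers=2, hidden_size=256, intermediate_size=512,
                 num_attention_heads=4, num_key_value_heads=2, head_dim=64)
```

Keeping `num_key_value_heads` much smaller than `num_attention_heads` is what drives the `repeat_kv` expansion used by every attention variant in the modeling file that follows.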
+import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import _flash_attention_forward +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + is_torchdynamo_compiling, + logging, + replace_return_docstrings, +) +from .configuration_glm import GlmConfig + + +if is_flash_attn_2_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +class GlmRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + GlmRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +logger = logging.get_logger(__name__) + + +class GlmRotaryEmbedding(nn.Module): + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[GlmConfig] = None, + ): + super().__init__() + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`GlmRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. 
All other arguments will be removed in v4.46" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len + + @torch.no_grad() + def forward(self, x, position_ids): + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + # Advanced RoPE types (e.g. 
yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class GlmLongRoPEScaledRotaryEmbedding(GlmRotaryEmbedding): + def __init__(self, dim, config, device=None): + super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) + + self.short_factor = config.rope_scaling["short_factor"] + self.long_factor = config.rope_scaling["long_factor"] + self.original_max_position_embeddings = config.original_max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + seq_len = seq_len or torch.max(position_ids) + 1 + if seq_len > self.original_max_position_embeddings: + ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device) + else: + ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device) + + inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim + self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) + + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + + cos = emb.cos() * scaling_factor + sin = emb.sin() * scaling_factor + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. 
+ Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class GlmMLP(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class GlmAttention(nn.Module): + def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_bias = config.attention_bias + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + + op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=self.attention_bias) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.") + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights += causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class GlmFlashAttention2(GlmAttention): + """ + GLM flash attention module. This module inherits from `GlmAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_dropout = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. + + if query_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.qkv_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + position_ids=position_ids, + dropout=attn_dropout, + sliding_window=getattr(self.config, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class GlmSdpaAttention(GlmAttention): + """ + GLM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `GlmAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + logger.warning_once( + "GlmModel is using GlmSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + min_dtype: float, + cache_position: torch.Tensor, + batch_size: int, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. 
+ target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + min_dtype (`float`): + The minimum value representable with the dtype `dtype`. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +GLM_ATTENTION_CLASSES = { + "eager": GlmAttention, + "flash_attention_2": GlmFlashAttention2, + "sdpa": GlmSdpaAttention, +} + + +class GlmDecoderLayer(nn.Module): + def __init__(self, config: GlmConfig, layer_idx: int): + super().__init__() + + self.config = config + self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) + + self.mlp = GlmMLP(config) + self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) + self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) + self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + hidden_states = residual + self.resid_attn_dropout(attn_outputs) + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.resid_mlp_dropout(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +GLM_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GlmConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Glm Model outputting raw hidden-states without any specific head on top.", + GLM_START_DOCSTRING, +) +class GlmPreTrainedModel(PreTrainedModel): + config_class = GlmConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["GlmDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +_CONFIG_FOR_DOC = "GlmConfig" + + +GLM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. 
+ + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare Glm Model outputting raw hidden-states without any specific head on top.", + GLM_START_DOCSTRING, +) +class GlmModel(GlmPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GlmDecoderLayer`] + + Args: + config: GlmConfig + """ + + def __init__(self, config: GlmConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = GlmRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
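# --- Editorial annotation (not part of the patch): how `_update_causal_mask` resolves,
# --- summarizing the branches in this method.
#   * flash_attention_2: the 2D mask is returned as-is when it contains padding zeros,
#     otherwise None (purely causal attention needs no explicit mask);
#   * sdpa without a static cache and without output_attentions: None whenever
#     `AttentionMaskConverter._ignore_causal_mask_sdpa` says the mask can be dropped, so
#     SDPA's own `is_causal` fast path is taken;
#   * otherwise: a 4D float mask is built via `_prepare_4d_causal_attention_mask_with_cache_position`,
#     and for SDPA on CUDA fully-masked rows are re-opened with `_unmask_unattended`, as
#     required by the memory-efficient SDPA path (see the pytorch#110213 reference below).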
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + min_dtype=min_dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = GlmModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked 
language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, GlmForCausalLM + + >>> model = GlmForCausalLM.from_pretrained("google/glm-7b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/glm-7b") + + >>> prompt = "What is your favorite condiment?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if labels is None and not is_torchdynamo_compiling(): + logger.warning_once( + "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)" + ) + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + # TODO: remove the float() operation in v4.46 + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float() + + loss = None + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + num_logits_to_keep=None, + **kwargs, + ): + # If we 
have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + if past_key_values is not None: + if inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + position_ids = position_ids.clone(memory_format=torch.contiguous_format) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # The clone here is for the same reason as for `position_ids`. + model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} + + if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: + if model_inputs["inputs_embeds"] is not None: + batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape + device = model_inputs["inputs_embeds"].device + else: + batch_size, sequence_length = model_inputs["input_ids"].shape + device = model_inputs["input_ids"].device + + dtype = self.lm_head.weight.dtype + min_dtype = torch.finfo(dtype).min + + attention_mask = _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_length(), + dtype=dtype, + device=device, + min_dtype=min_dtype, + cache_position=cache_position, + batch_size=batch_size, + ) + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + +@add_start_docstrings( + """ + The Glm Model transformer with a sequence classification head on top (linear layer). + + [`GlmForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + GLM_START_DOCSTRING, +) +class GlmForSequenceClassification(GlmPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = GlmModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + The Glm Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. + """, + GLM_START_DOCSTRING, +) +class GlmForTokenClassification(GlmPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = GlmModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
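For this token-classification head the labels are per token, i.e. of shape `(batch_size, sequence_length)`, since the loss below flattens logits and labels together. A small usage sketch (`model` stands for an already-instantiated `GlmForTokenClassification`; the values are made up):

    import torch

    num_labels = 5                                  # e.g. 5 NER tags
    input_ids = torch.randint(0, 151552, (2, 6))    # batch of 2 sequences, 6 tokens each
    labels = torch.randint(0, num_labels, (2, 6))   # one label per token
    outputs = model(input_ids=input_ids, labels=labels)
    outputs.loss  # cross-entropy averaged over all 2 * 6 token positions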
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 9be2099ce094..5687066e620a 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -13,17 +13,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple import math +from typing import Optional, Tuple import torch import torch.nn as nn import torch.utils.checkpoint from ...cache_utils import Cache - from ...configuration_utils import PretrainedConfig - +from ...modeling_flash_attention_utils import _flash_attention_forward from ...utils import ( is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, @@ -33,7 +32,6 @@ LlamaRMSNorm, LlamaRotaryEmbedding, LlamaModel, - LlamaForCausalLM, apply_rotary_pos_emb, repeat_kv, ) @@ -41,6 +39,11 @@ Phi3MLP, Phi3DecoderLayer ) +from ..gemma.modeling_gemma import ( + GemmaForCausalLM, + GemmaForSequenceClassification, + GemmaForTokenClassification, +) if is_flash_attn_2_available(): @@ -50,73 +53,11 @@ logger = logging.get_logger(__name__) -# class GlmConfig(Phi3Config): -# model_type = "glm" - -# def __init__( -# self, -# # Phi3 args -# vocab_size=151552, -# hidden_size=4096, -# intermediate_size=13696, -# num_hidden_layers=40, -# num_attention_heads=32, -# num_key_value_heads=2, -# resid_pdrop=0.0, -# attention_dropout=0.0, -# hidden_act="silu", -# max_position_embeddings=131072, -# initializer_range=0.02, -# rms_norm_eps=1e-5, -# use_cache=True, -# tie_word_embeddings=False, -# rope_theta=10000.0, -# rope_scaling={"rope_type": "linear", "factor": 1.,}, -# pad_token_id=151329, -# eos_token_id=[151329, 151336, 151338], -# bos_token_id=None, -# # Additionnal args -# head_dim=128, -# partial_rotary_factor=0.5, -# attention_bias=True, -# **kwargs, -# ): -# super().__init__( -# vocab_size=vocab_size, -# hidden_size=hidden_size, -# intermediate_size=intermediate_size, -# num_hidden_layers=num_hidden_layers, -# num_attention_heads=num_attention_heads, -# num_key_value_heads=num_key_value_heads, -# resid_pdrop=resid_pdrop, -# attention_dropout=attention_dropout, -# hidden_act=hidden_act, -# max_position_embeddings=max_position_embeddings, -# initializer_range=initializer_range, -# rms_norm_eps=rms_norm_eps, -# use_cache=use_cache, -# tie_word_embeddings=tie_word_embeddings, -# rope_theta=rope_theta, -# rope_scaling=rope_scaling, -# pad_token_id=pad_token_id, -# bos_token_id=bos_token_id, -# eos_token_id=eos_token_id, -# **kwargs, -# ) 
-# self.head_dim = head_dim -# self.partial_rotary_factor = partial_rotary_factor -# self.attention_bias = attention_bias -# del self.embd_pdrop -# del self.original_max_mposition_embeddings -# del self.sliding_window - - class GlmConfig(PretrainedConfig): model_type = "glm" def __init__( self, - # Phi3 args vocab_size=151552, hidden_size=4096, intermediate_size=13696, @@ -136,7 +77,6 @@ def __init__( pad_token_id=151329, eos_token_id=[151329, 151336, 151338], bos_token_id=None, - # Additionnal args head_dim=128, partial_rotary_factor=0.5, attention_bias=True, @@ -177,7 +117,6 @@ class GlmRMSNorm(LlamaRMSNorm): pass -# Need config.rope_type: linear class GlmRotaryEmbedding(LlamaRotaryEmbedding): pass @@ -392,7 +331,6 @@ def forward( return attn_output, attn_weights, past_key_value - class GlmSdpaAttention(GlmAttention): """ GLM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from @@ -438,7 +376,7 @@ def forward( value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models @@ -546,6 +484,7 @@ def forward( attn_outputs, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, + position_ids=position_ids, past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, @@ -586,9 +525,25 @@ def __init__(self, config: GlmConfig): self.post_init() -class GlmForCausalLM(LlamaForCausalLM): +class GlmForCausalLM(GemmaForCausalLM): def __init__(self, config): super().__init__(config) self.model = GlmModel(config) self.post_init() + + +class GlmForSequenceClassification(GemmaForSequenceClassification): + + def __init__(self, config): + super().__init__(config) + self.model = GlmModel(config) + self.post_init() + + +class GlmForTokenClassification(GemmaForTokenClassification): + + def __init__(self, config): + super().__init__(config) + self.model = GlmModel(config) + self.post_init() \ No newline at end of file From 7584246b6a507e160d6a06798cc66e5feda0fa8f Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 25 Sep 2024 19:52:21 +0200 Subject: [PATCH 08/39] Update given last version --- .../models/glm/configuration_glm.py | 11 ++- .../models/glm/convert_glm_weights_to_hf.py | 97 +++++++++++++++++++ src/transformers/models/glm/modeling_glm.py | 39 +++++--- src/transformers/models/glm/modular_glm.py | 75 ++++++++------ 4 files changed, 175 insertions(+), 47 deletions(-) create mode 100644 src/transformers/models/glm/convert_glm_weights_to_hf.py diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 5350cbe13f5b..d69e2c342a13 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -29,7 +29,7 @@ class GlmConfig(PretrainedConfig): def __init__( self, - vocab_size=151552, + vocab_size=65024, hidden_size=4096, intermediate_size=13696, num_hidden_layers=40, @@ -41,6 +41,9 @@ def __init__( max_position_embeddings=131072, initializer_range=0.02, rms_norm_eps=1e-5, + use_rms_norm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, use_cache=True, 
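+        # The three new flags above mirror options of the original GLM
+        # checkpoints: `use_rms_norm` selects GlmRMSNorm vs. nn.LayerNorm,
+        # `post_layer_norm` keeps or drops the model's final norm, and
+        # `apply_residual_connection_post_layernorm` starts each residual
+        # branch from the normalized rather than the raw hidden states.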
tie_word_embeddings=False, rope_theta=10000.0, @@ -54,6 +57,7 @@ def __init__( head_dim=128, partial_rotary_factor=0.5, attention_bias=True, + linear_bias=False, **kwargs, ): super().__init__( @@ -75,12 +79,15 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps + self.use_rms_norm = use_rms_norm + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.post_layer_norm = post_layer_norm self.use_cache = use_cache self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.rope_theta = rope_theta self.rope_scaling = rope_scaling self.head_dim = head_dim self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias + self.linear_bias = linear_bias diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py new file mode 100644 index 000000000000..4fc77221d8b0 --- /dev/null +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -0,0 +1,97 @@ + +import os +import math +import json + +import torch +from safetensors.torch import load_file as safe_load_file +from transformers import GlmConfig, GlmForCausalLM + +STATE_DICT_MAPPING = { + "transformer.": "model.", + "transformer.output_layer.": "lm_head.", + ".embedding.": ".embed_tokens.", + ".encoder.layers.": ".layers.", + "final_layernorm.": "norm.", + "rotary_pos_embed.": "rotary_emb.", + "self_attention.": "self_attn.", + "query_key_value.": "qkv_proj.", + "dense.": "o_proj.", + "dense_h_to_4h.": "gate_up_proj.", + "dense_4h_to_h.": "down_proj." +} + + +def merge_safetensors(input_dir: str): + all_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith('.safetensors')] + all_files = sorted(all_files, key=lambda x: int(x.split('-', 2)[1])) + + output_path = os.path.join(input_dir, 'consolidated.safetensors') + with open(output_path, "wb") as f_out: + for filepath in all_files: + with open(filepath, "rb") as f_in: + f_out.write(f_in.read()) + + +def convert_state_dict(original_state_dict: dict): + new_dict = {} + + for key, value in original_state_dict.items(): + new_key = key + for old, new in STATE_DICT_MAPPING.items(): + new_key = new_key.replace(old, new) + + new_dict[new_key] = value + return new_dict + + +def convert_config(original_config: dict): + new_config = GlmConfig( + vocab_size=original_config.pop("vocab_size"), + hidden_size=original_config.pop("hidden_size"), + intermediate_size=original_config.pop("ffn_hidden_size"), + num_hidden_layers=original_config.pop("num_hidden_layer"), + num_attention_heads=original_config.pop("num_attention_heads"), + num_key_value_heads=original_config.pop("multi_query_group_num"), + resid_pdrop=original_config.pop("hidden_dropout"), + attention_dropout=original_config.pop("attention_dropout"), + max_position_embeddings=original_config.pop("max_position_embeddings"), + initializer_range=original_config.pop("initializer_range"), + rms_norm_eps=original_config.pop("layernorm_epsilon"), + use_rms_norm=original_config.pop("rmsnorm"), + apply_residual_connection_post_layernorm=original_config.pop("apply_residual_connection_post_layernorm"), + post_layer_norm=original_config.pop("post_layer_norm"), + use_cache=original_config.pop("use_cache"), + rope_scaling={ + "rope_type": "linear", + "factor": original_config.pop("rope_ratio") + }, + head_dim=original_config.pop("kv_channels"), + 
attention_bias=original_config.pop("add_qkv_bias"), + linear_bias=original_config.pop("add_bias_linear"), + ) + return new_config + + +def convert_glm_model(input_dir, output_dir): + + # Load and convert config + with open(os.path.join(input_dir, "config.json")) as f: + original_config = json.load(f) + config = convert_config(original_config) + config.save_pretrained(output_dir) + + # Load and convert weights + merge_safetensors(input_dir) + original_state_dict = safe_load_file(os.path.join(input_dir, "consolidated.safetensors")) + new_dict = convert_state_dict(original_state_dict) + with torch.device("meta"): + model = GlmForCausalLM.from_config(config) + model.load_state_dict(new_dict, strict=True, assign=True) + model.save_pretrained(output_dir) + + + tokenizer = convert_mistral_tokenizer() + image_processor = PixtralImageProcessor() + processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]") + processor.save_pretrained(output_dir) \ No newline at end of file diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 1c9d7da3fe59..345810eeec1e 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -248,8 +248,9 @@ def __init__(self, config): super().__init__() self.config = config - self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) - self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=self.config.linear_bias) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=self.config.linear_bias) self.activation_fn = ACT2FN[config.hidden_act] @@ -303,8 +304,8 @@ def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): ) op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=self.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=self.config.linear_bias) + self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=self.attention_bias or self.config.linear_bias) def forward( self, @@ -634,11 +635,20 @@ def __init__(self, config: GlmConfig, layer_idx: int): self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.mlp = GlmMLP(config) - self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm = ( + GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if config.use_rms_norm + else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + ) self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) - self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + self.post_attention_layernorm = ( + GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if config.use_rms_norm + else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + ) def forward( self, @@ -675,13 +685,12 @@ def forward( into the model """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) + hidden_states_after_norm = 
self.input_layernorm(hidden_states) + residual = hidden_states_after_norm if self.apply_residual_connection_post_layernorm else hidden_states # Self Attention attn_outputs, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, + hidden_states=hidden_states_after_norm, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, @@ -693,9 +702,10 @@ def forward( hidden_states = residual + self.resid_attn_dropout(attn_outputs) - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) + hidden_states_after_norm = self.post_attention_layernorm(hidden_states) + residual = hidden_states_after_norm if self.apply_residual_connection_post_layernorm else hidden_states + + hidden_states = self.mlp(hidden_states_after_norm) hidden_states = residual + self.resid_mlp_dropout(hidden_states) outputs = (hidden_states,) @@ -853,9 +863,10 @@ def __init__(self, config: GlmConfig): self.layers = nn.ModuleList( [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm = norm_func(config.hidden_size, eps=config.rms_norm_eps) if config.post_layer_norm else nn.Identity() self.rotary_emb = GlmRotaryEmbedding(config=config) self.gradient_checkpointing = False + norm_func = GlmRMSNorm if config.use_rms_norm else nn.LayerNorm # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 5687066e620a..5c3a1480107e 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -58,7 +58,7 @@ class GlmConfig(PretrainedConfig): def __init__( self, - vocab_size=151552, + vocab_size=65024, hidden_size=4096, intermediate_size=13696, num_hidden_layers=40, @@ -70,6 +70,9 @@ def __init__( max_position_embeddings=131072, initializer_range=0.02, rms_norm_eps=1e-5, + use_rms_norm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, use_cache=True, tie_word_embeddings=False, rope_theta=10000.0, @@ -80,6 +83,7 @@ def __init__( head_dim=128, partial_rotary_factor=0.5, attention_bias=True, + linear_bias=False, **kwargs, ): super().__init__( @@ -90,26 +94,29 @@ def __init__( **kwargs, ) self.vocab_size = vocab_size - self.hidden_size=hidden_size - self.intermediate_size=intermediate_size - self.num_hidden_layers=num_hidden_layers - self.num_attention_heads=num_attention_heads - self.num_key_value_heads=num_key_value_heads - self.resid_pdrop=resid_pdrop - self.attention_dropout=attention_dropout - self.hidden_act=hidden_act - self.max_position_embeddings=max_position_embeddings - self.initializer_range=initializer_range - self.rms_norm_eps=rms_norm_eps - self.use_cache=use_cache - self.initializer_range=initializer_range - self.rms_norm_eps=rms_norm_eps - self.use_cache=use_cache - self.rope_theta=rope_theta - self.rope_scaling=rope_scaling + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.resid_pdrop = resid_pdrop + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + 
self.use_rms_norm = use_rms_norm + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.post_layer_norm = post_layer_norm + self.use_cache = use_cache + self.initializer_range = initializer_range + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling self.head_dim = head_dim self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias + self.linear_bias = linear_bias @@ -122,7 +129,11 @@ class GlmRotaryEmbedding(LlamaRotaryEmbedding): class GlmMLP(Phi3MLP): - pass + def __init__(self, config): + super().__init__(config) + + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=self.config.linear_bias) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=self.config.linear_bias) class GlmAttention(nn.Module): @@ -154,8 +165,8 @@ def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): ) op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=self.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=self.config.linear_bias) + self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=self.attention_bias or self.config.linear_bias) def forward( self, @@ -431,14 +442,15 @@ def __init__(self, config: GlmConfig, layer_idx: int): super().__init__() self.config = config + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.mlp = GlmMLP(config) - self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) - self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( @@ -476,13 +488,12 @@ def forward( into the model """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) + hidden_states_after_norm = self.input_layernorm(hidden_states) + residual = hidden_states_after_norm if self.apply_residual_connection_post_layernorm else hidden_states # Self Attention attn_outputs, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, + hidden_states=hidden_states_after_norm, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, @@ -494,9 +505,10 @@ def forward( hidden_states = residual + self.resid_attn_dropout(attn_outputs) - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) + hidden_states_after_norm = self.post_attention_layernorm(hidden_states) + residual = hidden_states_after_norm if self.apply_residual_connection_post_layernorm else hidden_states + + hidden_states = self.mlp(hidden_states_after_norm) hidden_states = residual + self.resid_mlp_dropout(hidden_states) outputs = 
(hidden_states,) @@ -517,7 +529,8 @@ def __init__(self, config: GlmConfig): self.layers = nn.ModuleList( [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + norm_func = GlmRMSNorm if config.use_rms_norm else nn.LayerNorm + self.norm = norm_func(config.hidden_size, eps=config.rms_norm_eps) if config.post_layer_norm else nn.Identity() self.rotary_emb = GlmRotaryEmbedding(config=config) self.gradient_checkpointing = False From 603421e869eb2d8f2d789eee1ab5bdd3f45ed1a8 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 25 Sep 2024 20:00:52 +0200 Subject: [PATCH 09/39] Last update --- src/transformers/models/glm/configuration_glm.py | 4 ++-- src/transformers/models/glm/convert_glm_weights_to_hf.py | 2 +- src/transformers/models/glm/modular_glm.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index d69e2c342a13..1431b49a900c 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -29,7 +29,7 @@ class GlmConfig(PretrainedConfig): def __init__( self, - vocab_size=65024, + vocab_size=151552, hidden_size=4096, intermediate_size=13696, num_hidden_layers=40, @@ -40,7 +40,7 @@ def __init__( hidden_act="silu", max_position_embeddings=131072, initializer_range=0.02, - rms_norm_eps=1e-5, + rms_norm_eps=0.00000015625, use_rms_norm=True, apply_residual_connection_post_layernorm=False, post_layer_norm=True, diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 4fc77221d8b0..c13e99fc67c6 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -47,7 +47,7 @@ def convert_state_dict(original_state_dict: dict): def convert_config(original_config: dict): new_config = GlmConfig( - vocab_size=original_config.pop("vocab_size"), + vocab_size=original_config.pop("padded_vocab_size"), hidden_size=original_config.pop("hidden_size"), intermediate_size=original_config.pop("ffn_hidden_size"), num_hidden_layers=original_config.pop("num_hidden_layer"), diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 5c3a1480107e..b26644c5fd49 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -58,7 +58,7 @@ class GlmConfig(PretrainedConfig): def __init__( self, - vocab_size=65024, + vocab_size=151552, hidden_size=4096, intermediate_size=13696, num_hidden_layers=40, @@ -69,7 +69,7 @@ def __init__( hidden_act="silu", max_position_embeddings=131072, initializer_range=0.02, - rms_norm_eps=1e-5, + rms_norm_eps=0.00000015625, use_rms_norm=True, apply_residual_connection_post_layernorm=False, post_layer_norm=True, From 9e0dfeea04a2c5c9c48707695e2d32956ae1418c Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 10:37:45 +0200 Subject: [PATCH 10/39] Finalize model --- .../models/glm/configuration_glm.py | 7 - .../models/glm/convert_glm_weights_to_hf.py | 13 +- src/transformers/models/glm/modeling_glm.py | 251 ++++++------------ src/transformers/models/glm/modular_glm.py | 30 +-- 4 files changed, 98 insertions(+), 203 deletions(-) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 
1431b49a900c..ad7d83f75edb 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -47,15 +47,10 @@ def __init__( use_cache=True, tie_word_embeddings=False, rope_theta=10000.0, - rope_scaling={ - "rope_type": "linear", - "factor": 1.0, - }, pad_token_id=151329, eos_token_id=[151329, 151336, 151338], bos_token_id=None, head_dim=128, - partial_rotary_factor=0.5, attention_bias=True, linear_bias=False, **kwargs, @@ -86,8 +81,6 @@ def __init__( self.initializer_range = initializer_range self.use_cache = use_cache self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.head_dim = head_dim - self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.linear_bias = linear_bias diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index c13e99fc67c6..a7f068898c3e 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -46,30 +46,31 @@ def convert_state_dict(original_state_dict: dict): def convert_config(original_config: dict): + + num_attention_heads = original_config.pop("num_attention_heads") + new_config = GlmConfig( vocab_size=original_config.pop("padded_vocab_size"), hidden_size=original_config.pop("hidden_size"), intermediate_size=original_config.pop("ffn_hidden_size"), num_hidden_layers=original_config.pop("num_hidden_layer"), - num_attention_heads=original_config.pop("num_attention_heads"), - num_key_value_heads=original_config.pop("multi_query_group_num"), + num_attention_heads=num_attention_heads, + num_key_value_heads=num_attention_heads if not original_config.pop("multi_query_attention") else original_config.pop("multi_query_group_num"), resid_pdrop=original_config.pop("hidden_dropout"), attention_dropout=original_config.pop("attention_dropout"), max_position_embeddings=original_config.pop("max_position_embeddings"), initializer_range=original_config.pop("initializer_range"), rms_norm_eps=original_config.pop("layernorm_epsilon"), + rope_theta=10000. 
* original_config.pop("rope_ratio"), use_rms_norm=original_config.pop("rmsnorm"), apply_residual_connection_post_layernorm=original_config.pop("apply_residual_connection_post_layernorm"), post_layer_norm=original_config.pop("post_layer_norm"), use_cache=original_config.pop("use_cache"), - rope_scaling={ - "rope_type": "linear", - "factor": original_config.pop("rope_ratio") - }, head_dim=original_config.pop("kv_channels"), attention_bias=original_config.pop("add_qkv_bias"), linear_bias=original_config.pop("add_bias_linear"), ) + print(f'Unused config keys: {original_config.keys(),}') return new_config diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 345810eeec1e..a4a1b312426e 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -38,7 +38,6 @@ SequenceClassifierOutputWithPast, TokenClassifierOutput, ) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, @@ -56,6 +55,63 @@ from ...modeling_flash_attention_utils import _flash_attention_forward +logger = logging.get_logger(__name__) + + +def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + min_dtype: float, + cache_position: torch.Tensor, + batch_size: int, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + min_dtype (`float`): + The minimum value representable with the dtype `dtype`. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
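+        # Worked example for the `else` branch below: with sequence_length=2,
+        # target_length=4, cache_position=[2, 3] and a 2D all-ones
+        # attention_mask, the result has shape (batch_size, 1, 2, 4) and its
+        # only non-zero entry is min_dtype at [:, :, 0, 3], i.e. the query at
+        # cache position 2 cannot attend to the not-yet-filled slot 3, while
+        # the query at position 3 attends to every slot.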
+ causal_mask = attention_mask + else: + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Glm class GlmRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ @@ -76,118 +132,24 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" -logger = logging.get_logger(__name__) - - +# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->glm, Gemma->Glm class GlmRotaryEmbedding(nn.Module): - def __init__( - self, - dim=None, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0, - rope_type="default", - config: Optional[GlmConfig] = None, - ): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() - # TODO (joao): remove the `if` below, only used for BC - self.rope_kwargs = {} - if config is None: - logger.warning_once( - "`GlmRotaryEmbedding` can now be fully parameterized by passing the model config through the " - "`config` argument. 
All other arguments will be removed in v4.46" - ) - self.rope_kwargs = { - "rope_type": rope_type, - "factor": scaling_factor, - "dim": dim, - "base": base, - "max_position_embeddings": max_position_embeddings, - } - self.rope_type = rope_type - self.max_seq_len_cached = max_position_embeddings - self.original_max_seq_len = max_position_embeddings - else: - # BC: "rope_type" was originally "type" - if config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - def _dynamic_frequency_update(self, position_ids, device): - """ - dynamic RoPE layers should recompute `inv_freq` in the following situations: - 1 - growing beyond the cached sequence length (allow scaling) - 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) - """ - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_seq_len_cached: # growth - inv_freq, self.attention_scaling = self.rope_init_fn( - self.config, device, seq_len=seq_len, **self.rope_kwargs - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation - self.max_seq_len_cached = seq_len + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base - if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset - self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) - self.max_seq_len_cached = self.original_max_seq_len - - @torch.no_grad() - def forward(self, x, position_ids): - if "dynamic" in self.rope_type: - self._dynamic_frequency_update(position_ids, device=x.device) - - # Core RoPE block - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - # Force float32 (see https://github.com/huggingface/transformers/pull/29285) - device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - - # Advanced RoPE types (e.g. 
yarn) apply a post-processing scaling factor, equivalent to scaling attention - cos = cos * self.attention_scaling - sin = sin * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -class GlmLongRoPEScaledRotaryEmbedding(GlmRotaryEmbedding): - def __init__(self, dim, config, device=None): - super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) - - self.short_factor = config.rope_scaling["short_factor"] - self.long_factor = config.rope_scaling["long_factor"] - self.original_max_position_embeddings = config.original_max_position_embeddings + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) @torch.no_grad() def forward(self, x, position_ids, seq_len=None): - seq_len = seq_len or torch.max(position_ids) + 1 - if seq_len > self.original_max_position_embeddings: - ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device) - else: - ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device) - - inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim - self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) - + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts # See https://github.com/huggingface/transformers/pull/29285 device_type = x.device.type @@ -195,19 +157,11 @@ def forward(self, x, position_ids, seq_len=None): with torch.autocast(device_type=device_type, enabled=False): freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - - scale = self.max_position_embeddings / self.original_max_position_embeddings - if scale <= 1.0: - scaling_factor = 1.0 - else: - scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) - - cos = emb.cos() * scaling_factor - sin = emb.sin() * scaling_factor + cos = emb.cos() + sin = emb.sin() return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# Copied from transformers.models.llama.modeling_llama.rotate_half def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -215,7 +169,6 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -263,7 +216,6 @@ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: return self.down_proj(up_states) -# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, @@ -567,59 +519,6 @@ def forward( return attn_output, None, past_key_value -def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - min_dtype: float, - cache_position: torch.Tensor, - batch_size: int, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. - device (`torch.device`): - The device to plcae the 4D attention mask on. - min_dtype (`float`): - The minimum value representable with the dtype `dtype`. - cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. - causal_mask = attention_mask - else: - causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - padding_mask, min_dtype - ) - - return causal_mask - - GLM_ATTENTION_CLASSES = { "eager": GlmAttention, "flash_attention_2": GlmFlashAttention2, @@ -632,6 +531,7 @@ def __init__(self, config: GlmConfig, layer_idx: int): super().__init__() self.config = config + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.mlp = GlmMLP(config) @@ -643,7 +543,6 @@ def __init__(self, config: GlmConfig, layer_idx: int): self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm self.post_attention_layernorm = ( GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm @@ -863,10 +762,16 @@ def __init__(self, config: GlmConfig): self.layers = nn.ModuleList( [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self.norm = norm_func(config.hidden_size, eps=config.rms_norm_eps) if config.post_layer_norm else nn.Identity() - 
self.rotary_emb = GlmRotaryEmbedding(config=config) + self.norm = norm if config.post_layer_norm else nn.Identity() + self.rotary_emb = GlmRotaryEmbedding( + dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta + ) self.gradient_checkpointing = False - norm_func = GlmRMSNorm if config.use_rms_norm else nn.LayerNorm + norm = ( + GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if config.use_rms_norm + else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + ) # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index b26644c5fd49..f4e08af4d33a 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -28,16 +28,16 @@ is_flash_attn_greater_or_equal_2_10, logging, ) -from ..llama.modeling_llama import ( - LlamaRMSNorm, - LlamaRotaryEmbedding, - LlamaModel, +from ..phi3.modeling_phi3 import ( + Phi3RMSNorm, + Phi3RotaryEmbedding, + Phi3MLP, + Phi3DecoderLayer, apply_rotary_pos_emb, repeat_kv, ) -from ..phi3.modeling_phi3 import ( - Phi3MLP, - Phi3DecoderLayer +from ..llama.modeling_llama import ( + LlamaModel, ) from ..gemma.modeling_gemma import ( GemmaForCausalLM, @@ -76,12 +76,10 @@ def __init__( use_cache=True, tie_word_embeddings=False, rope_theta=10000.0, - rope_scaling={"rope_type": "linear", "factor": 1.,}, pad_token_id=151329, eos_token_id=[151329, 151336, 151338], bos_token_id=None, head_dim=128, - partial_rotary_factor=0.5, attention_bias=True, linear_bias=False, **kwargs, @@ -112,19 +110,17 @@ def __init__( self.initializer_range = initializer_range self.use_cache = use_cache self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.head_dim = head_dim - self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.linear_bias = linear_bias -class GlmRMSNorm(LlamaRMSNorm): +class GlmRMSNorm(Phi3RMSNorm): pass -class GlmRotaryEmbedding(LlamaRotaryEmbedding): +class GlmRotaryEmbedding(Phi3RotaryEmbedding): pass @@ -436,7 +432,7 @@ def forward( } -class GlmDecoderLayer(Phi3DecoderLayer): +class GlmDecoderLayer(nn.Module): def __init__(self, config: GlmConfig, layer_idx: int): super().__init__() @@ -529,9 +525,9 @@ def __init__(self, config: GlmConfig): self.layers = nn.ModuleList( [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - norm_func = GlmRMSNorm if config.use_rms_norm else nn.LayerNorm - self.norm = norm_func(config.hidden_size, eps=config.rms_norm_eps) if config.post_layer_norm else nn.Identity() - self.rotary_emb = GlmRotaryEmbedding(config=config) + norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm = norm if config.post_layer_norm else nn.Identity() + self.rotary_emb = GlmRotaryEmbedding(dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta) self.gradient_checkpointing = False # Initialize weights and apply final processing From 414100d479e3db48d9bf512ecc581a4c3dd3d69e Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 15:37:04 +0200 Subject: [PATCH 11/39] Finalize converter --- .../models/glm/convert_glm_weights_to_hf.py | 76 +++++++++++++++++-- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py 
b/src/transformers/models/glm/convert_glm_weights_to_hf.py index a7f068898c3e..3b36410ded39 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -1,11 +1,13 @@ import os -import math +import argparse import json import torch from safetensors.torch import load_file as safe_load_file -from transformers import GlmConfig, GlmForCausalLM +from transformers import GlmConfig, GlmForCausalLM, AutoTokenizer, PreTrainedTokenizerFast +from transformers.convert_slow_tokenizer import TikTokenConverter +from tokenizers import AddedToken, Regex, Tokenizer, decoders, pre_tokenizers, processors STATE_DICT_MAPPING = { "transformer.": "model.", @@ -22,6 +24,36 @@ } +class GlmConverter(TikTokenConverter): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def converted(self) -> Tokenizer: + tokenizer = self.tokenizer() + tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False), + pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False), + ] + ) + tokenizer.decoder = decoders.ByteLevel() + tokenizer.add_special_tokens(self.additional_special_tokens) + + tokenizer.post_processor = processors.Sequence( + [ + processors.ByteLevel(trim_offsets=False), + processors.TemplateProcessing( + single=f"[gMASK]:0 :0 $A:0", + pair=f"[gMASK]:0 :0 $A:0 $B:1", + special_tokens=[("[gMASK]", 151331), ("",151333)] + ) + ], + ) + + return tokenizer + + def merge_safetensors(input_dir: str): all_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith('.safetensors')] all_files = sorted(all_files, key=lambda x: int(x.split('-', 2)[1])) @@ -74,6 +106,22 @@ def convert_config(original_config: dict): return new_config +def convert_glm_tokenizer(input_dir): + + fast_tok = GlmConverter(os.path.join(input_dir, 'tokenizer.model'), additional_special_tokens=[]).converted() + tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b", trust_remote_code=True) + new_tok = PreTrainedTokenizerFast(tokenizer_object=fast_tok, + bos_token=tokenizer.bos_token, + eos_token=tokenizer.eos_token, + pad_token=tokenizer.pad_token, + clean_up_tokenization_spaces=tokenizer.clean_up_tokenization_spaces, + additional_special_tokens=tokenizer.additional_special_tokens, + padding_side=tokenizer.padding_side + ) + + return new_tok + + def convert_glm_model(input_dir, output_dir): # Load and convert config @@ -91,8 +139,24 @@ def convert_glm_model(input_dir, output_dir): model.load_state_dict(new_dict, strict=True, assign=True) model.save_pretrained(output_dir) + # Load and convert tokenizer + tokenizer = convert_glm_tokenizer(input_dir) + tokenizer.save_pretrained(output_dir) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + "input_dir", + type=str, + help="Location of the local folder copied from the Hub.", + ) + parser.add_argument( + "output_dir", + type=str, + help="Location to write HF model and tokenizer", + ) + + args = parser.parse_args() + convert_glm_model(args.input_dir, args.output_dir) - tokenizer = convert_mistral_tokenizer() - image_processor = PixtralImageProcessor() - processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]") - processor.save_pretrained(output_dir) \ No newline at end of file From b816507c7ae12d89f07bba9d3497a84a4c55e265 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 15:57:32 +0200 
Subject: [PATCH 12/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 3b36410ded39..6bd52bcd7ec7 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -101,6 +101,8 @@ def convert_config(original_config: dict): head_dim=original_config.pop("kv_channels"), attention_bias=original_config.pop("add_qkv_bias"), linear_bias=original_config.pop("add_bias_linear"), + eos_token_id=original_config.pop("eos_token_id"), + pad_token_id=original_config["pad_token_id"], ) print(f'Unused config keys: {original_config.keys(),}') return new_config From 590321b3b29ecea729f6d25dd6e1ca58a1f82797 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 16:07:46 +0200 Subject: [PATCH 13/39] style --- .../models/glm/convert_glm_weights_to_hf.py | 59 +++++++++--------- src/transformers/models/glm/modular_glm.py | 60 ++++++++++--------- 2 files changed, 61 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 6bd52bcd7ec7..2372631595b3 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -1,13 +1,14 @@ - -import os import argparse import json +import os import torch from safetensors.torch import load_file as safe_load_file -from transformers import GlmConfig, GlmForCausalLM, AutoTokenizer, PreTrainedTokenizerFast +from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors + +from transformers import AutoTokenizer, GlmConfig, GlmForCausalLM, PreTrainedTokenizerFast from transformers.convert_slow_tokenizer import TikTokenConverter -from tokenizers import AddedToken, Regex, Tokenizer, decoders, pre_tokenizers, processors + STATE_DICT_MAPPING = { "transformer.": "model.", @@ -20,12 +21,11 @@ "query_key_value.": "qkv_proj.", "dense.": "o_proj.", "dense_h_to_4h.": "gate_up_proj.", - "dense_4h_to_h.": "down_proj." 
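For reference, a minimal sketch of how `convert_state_dict` rewrites one typical GLM checkpoint key with the mapping above (the example key is illustrative):

    original_key = "transformer.encoder.layers.0.self_attention.query_key_value.weight"
    new_key = original_key
    for old, new in STATE_DICT_MAPPING.items():
        new_key = new_key.replace(old, new)
    # new_key == "model.layers.0.self_attn.qkv_proj.weight"
    # Note that substrings are replaced in dict order, so once "transformer."
    # has been rewritten the more specific "transformer.output_layer." entry
    # no longer matches.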
+ "dense_4h_to_h.": "down_proj.", } class GlmConverter(TikTokenConverter): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -44,10 +44,10 @@ def converted(self) -> Tokenizer: [ processors.ByteLevel(trim_offsets=False), processors.TemplateProcessing( - single=f"[gMASK]:0 :0 $A:0", - pair=f"[gMASK]:0 :0 $A:0 $B:1", - special_tokens=[("[gMASK]", 151331), ("",151333)] - ) + single="[gMASK]:0 :0 $A:0", + pair="[gMASK]:0 :0 $A:0 $B:1", + special_tokens=[("[gMASK]", 151331), ("", 151333)], + ), ], ) @@ -55,10 +55,10 @@ def converted(self) -> Tokenizer: def merge_safetensors(input_dir: str): - all_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith('.safetensors')] - all_files = sorted(all_files, key=lambda x: int(x.split('-', 2)[1])) + all_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] + all_files = sorted(all_files, key=lambda x: int(x.split("-", 2)[1])) - output_path = os.path.join(input_dir, 'consolidated.safetensors') + output_path = os.path.join(input_dir, "consolidated.safetensors") with open(output_path, "wb") as f_out: for filepath in all_files: with open(filepath, "rb") as f_in: @@ -78,7 +78,6 @@ def convert_state_dict(original_state_dict: dict): def convert_config(original_config: dict): - num_attention_heads = original_config.pop("num_attention_heads") new_config = GlmConfig( @@ -87,13 +86,15 @@ def convert_config(original_config: dict): intermediate_size=original_config.pop("ffn_hidden_size"), num_hidden_layers=original_config.pop("num_hidden_layer"), num_attention_heads=num_attention_heads, - num_key_value_heads=num_attention_heads if not original_config.pop("multi_query_attention") else original_config.pop("multi_query_group_num"), + num_key_value_heads=num_attention_heads + if not original_config.pop("multi_query_attention") + else original_config.pop("multi_query_group_num"), resid_pdrop=original_config.pop("hidden_dropout"), attention_dropout=original_config.pop("attention_dropout"), max_position_embeddings=original_config.pop("max_position_embeddings"), initializer_range=original_config.pop("initializer_range"), rms_norm_eps=original_config.pop("layernorm_epsilon"), - rope_theta=10000. 
* original_config.pop("rope_ratio"), + rope_theta=10000.0 * original_config.pop("rope_ratio"), use_rms_norm=original_config.pop("rmsnorm"), apply_residual_connection_post_layernorm=original_config.pop("apply_residual_connection_post_layernorm"), post_layer_norm=original_config.pop("post_layer_norm"), @@ -104,28 +105,27 @@ def convert_config(original_config: dict): eos_token_id=original_config.pop("eos_token_id"), pad_token_id=original_config["pad_token_id"], ) - print(f'Unused config keys: {original_config.keys(),}') + print(f"Unused config keys: {original_config.keys(),}") return new_config def convert_glm_tokenizer(input_dir): - - fast_tok = GlmConverter(os.path.join(input_dir, 'tokenizer.model'), additional_special_tokens=[]).converted() + fast_tok = GlmConverter(os.path.join(input_dir, "tokenizer.model"), additional_special_tokens=[]).converted() tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b", trust_remote_code=True) - new_tok = PreTrainedTokenizerFast(tokenizer_object=fast_tok, - bos_token=tokenizer.bos_token, - eos_token=tokenizer.eos_token, - pad_token=tokenizer.pad_token, - clean_up_tokenization_spaces=tokenizer.clean_up_tokenization_spaces, - additional_special_tokens=tokenizer.additional_special_tokens, - padding_side=tokenizer.padding_side - ) + new_tok = PreTrainedTokenizerFast( + tokenizer_object=fast_tok, + bos_token=tokenizer.bos_token, + eos_token=tokenizer.eos_token, + pad_token=tokenizer.pad_token, + clean_up_tokenization_spaces=tokenizer.clean_up_tokenization_spaces, + additional_special_tokens=tokenizer.additional_special_tokens, + padding_side=tokenizer.padding_side, + ) return new_tok def convert_glm_model(input_dir, output_dir): - # Load and convert config with open(os.path.join(input_dir, "config.json")) as f: original_config = json.load(f) @@ -146,7 +146,7 @@ def convert_glm_model(input_dir, output_dir): tokenizer.save_pretrained(output_dir) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "input_dir", @@ -161,4 +161,3 @@ def convert_glm_model(input_dir, output_dir): args = parser.parse_args() convert_glm_model(args.input_dir, args.output_dir) - diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index f4e08af4d33a..b06a64a31534 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -28,22 +28,21 @@ is_flash_attn_greater_or_equal_2_10, logging, ) +from ..gemma.modeling_gemma import ( + GemmaForCausalLM, + GemmaForSequenceClassification, + GemmaForTokenClassification, +) +from ..llama.modeling_llama import ( + LlamaModel, +) from ..phi3.modeling_phi3 import ( + Phi3MLP, Phi3RMSNorm, Phi3RotaryEmbedding, - Phi3MLP, - Phi3DecoderLayer, apply_rotary_pos_emb, repeat_kv, ) -from ..llama.modeling_llama import ( - LlamaModel, -) -from ..gemma.modeling_gemma import ( - GemmaForCausalLM, - GemmaForSequenceClassification, - GemmaForTokenClassification, -) if is_flash_attn_2_available(): @@ -115,7 +114,6 @@ def __init__( self.linear_bias = linear_bias - class GlmRMSNorm(Phi3RMSNorm): pass @@ -133,7 +131,6 @@ def __init__(self, config): class GlmAttention(nn.Module): - def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): super().__init__() self.config = config @@ -227,7 +224,7 @@ def forward( attn_weights = None return attn_output, attn_weights, past_key_value - + class GlmFlashAttention2(GlmAttention): """ @@ -255,7 +252,6 @@ def forward( cache_position: Optional[torch.LongTensor] = 
None, position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - output_attentions = False bsz, q_len, _ = hidden_states.size() @@ -336,7 +332,7 @@ def forward( attn_weights = None return attn_output, attn_weights, past_key_value - + class GlmSdpaAttention(GlmAttention): """ @@ -423,7 +419,7 @@ def forward( attn_output = self.o_proj(attn_output) return attn_output, None, past_key_value - + GLM_ATTENTION_CLASSES = { "eager": GlmAttention, @@ -433,7 +429,6 @@ def forward( class GlmDecoderLayer(nn.Module): - def __init__(self, config: GlmConfig, layer_idx: int): super().__init__() @@ -442,12 +437,19 @@ def __init__(self, config: GlmConfig, layer_idx: int): self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.mlp = GlmMLP(config) - self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm = ( + GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if config.use_rms_norm + else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + ) self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) - self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) - + self.post_attention_layernorm = ( + GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if config.use_rms_norm + else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + ) def forward( self, @@ -496,7 +498,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings + position_embeddings=position_embeddings, ) hidden_states = residual + self.resid_attn_dropout(attn_outputs) @@ -519,15 +521,20 @@ def forward( class GlmModel(LlamaModel): - def __init__(self, config: GlmConfig): super().__init__(config) self.layers = nn.ModuleList( [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + norm = ( + GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if config.use_rms_norm + else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + ) self.norm = norm if config.post_layer_norm else nn.Identity() - self.rotary_emb = GlmRotaryEmbedding(dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta) + self.rotary_emb = GlmRotaryEmbedding( + dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta + ) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -535,7 +542,6 @@ def __init__(self, config: GlmConfig): class GlmForCausalLM(GemmaForCausalLM): - def __init__(self, config): super().__init__(config) self.model = GlmModel(config) @@ -543,7 +549,6 @@ def __init__(self, config): class GlmForSequenceClassification(GemmaForSequenceClassification): - def __init__(self, config): super().__init__(config) self.model = GlmModel(config) @@ -551,8 +556,7 @@ def __init__(self, config): class GlmForTokenClassification(GemmaForTokenClassification): - def __init__(self, config): super().__init__(config) 
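An aside on the `GLM_ATTENTION_CLASSES` registry defined earlier in this modular_glm.py diff: the decoder layer does not hard-code a backend, it simply indexes that dictionary with `config._attn_implementation`. A minimal sketch of the pattern follows; the `Toy*` classes and config are placeholders, not the transformers implementation.

```python
# Toy illustration of registry-based attention dispatch (placeholder classes only).
import torch
import torch.nn as nn


class ToyEagerAttention(nn.Module):
    """Reference path; subclasses may override forward with a faster kernel."""

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return hidden_states  # placeholder computation


class ToySdpaAttention(ToyEagerAttention):
    pass


ATTENTION_CLASSES = {"eager": ToyEagerAttention, "sdpa": ToySdpaAttention}


class ToyConfig:
    # In transformers this attribute is set on the config when the model is instantiated.
    _attn_implementation = "sdpa"


class ToyDecoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Same lookup as GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=...)
        self.self_attn = ATTENTION_CLASSES[config._attn_implementation]()


layer = ToyDecoderLayer(ToyConfig())
assert isinstance(layer.self_attn, ToySdpaAttention)
```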
self.model = GlmModel(config) - self.post_init() \ No newline at end of file + self.post_init() From 85cbd60b1160e58ccf8f8aea0830511981f224fd Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 16:12:06 +0200 Subject: [PATCH 14/39] style --- src/transformers/models/glm/modeling_glm.py | 15 +++++++++------ src/transformers/models/glm/modular_glm.py | 14 ++++++++------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index a4a1b312426e..8201101fbec6 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -762,16 +762,19 @@ def __init__(self, config: GlmConfig): self.layers = nn.ModuleList( [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self.norm = norm if config.post_layer_norm else nn.Identity() + self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.rotary_emb = GlmRotaryEmbedding( dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta ) self.gradient_checkpointing = False - norm = ( - GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - if config.use_rms_norm - else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) - ) + if config.post_layer_norm: + self.norm = ( + GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if config.use_rms_norm + else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + ) + else: + self.norm = nn.Identity() # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index b06a64a31534..b6688198de90 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -526,12 +526,14 @@ def __init__(self, config: GlmConfig): self.layers = nn.ModuleList( [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - norm = ( - GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - if config.use_rms_norm - else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) - ) - self.norm = norm if config.post_layer_norm else nn.Identity() + if config.post_layer_norm: + self.norm = ( + GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if config.use_rms_norm + else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + ) + else: + self.norm = nn.Identity() self.rotary_emb = GlmRotaryEmbedding( dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta ) From 7ba2f3a933f4c6081a64c9f988ed4746fc2453b6 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 16:38:22 +0200 Subject: [PATCH 15/39] Create __init__.py --- src/transformers/models/glm/__init__.py | 61 +++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 src/transformers/models/glm/__init__.py diff --git a/src/transformers/models/glm/__init__.py b/src/transformers/models/glm/__init__.py new file mode 100644 index 000000000000..7e5fecb03080 --- /dev/null +++ b/src/transformers/models/glm/__init__.py @@ -0,0 +1,61 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_glm": ["GlmConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_glm"] = [ + "GlmForCausalLM", + "GlmModel", + "GlmPreTrainedModel", + "GlmForSequenceClassification", + "GlmForTokenClassification", + ] + +if TYPE_CHECKING: + from .configuration_glm import GlmConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_glm import ( + GlmForCausalLM, + GlmForSequenceClassification, + GlmForTokenClassification, + GlmModel, + GlmPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) From fd727a6dbc3a949866ae4100c87e08edf66bcc26 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 16:55:06 +0200 Subject: [PATCH 16/39] Aff all inits --- src/transformers/__init__.py | 19 +++++++++++++++++++ src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 ++ src/transformers/models/auto/modeling_auto.py | 4 ++++ .../models/auto/tokenization_auto.py | 1 + 5 files changed, 27 insertions(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 078e4d0e4abd..95ec1d874227 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -502,6 +502,7 @@ "Kosmos2Config", "Kosmos2Processor", ], + "models.glm": ["GlmConfig"], "models.layoutlm": [ "LayoutLMConfig", "LayoutLMTokenizer", @@ -2549,6 +2550,16 @@ "LlamaPreTrainedModel", ] ) + _import_structure["models.glm"].extend( + [ + "GlmForCausalLM", + "GlmForQuestionAnswering", + "GlmForSequenceClassification", + "GlmForTokenClassification", + "GlmModel", + "GlmPreTrainedModel", + ] + ) _import_structure["models.llava"].extend( [ "LlavaForConditionalGeneration", @@ -5266,6 +5277,7 @@ GitProcessor, GitVisionConfig, ) + from .models.glm import GlmConfig from .models.glpn import GLPNConfig from .models.gpt2 import ( GPT2Config, @@ -6977,6 +6989,13 @@ GitPreTrainedModel, GitVisionModel, ) + from .models.glm import ( + GlmForCausalLM, + GlmForSequenceClassification, + GlmForTokenClassification, + GlmModel, + GlmPreTrainedModel, + ) from .models.glpn import ( GLPNForDepthEstimation, GLPNModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index e47a4ed9c342..0b2c20f6eb80 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -97,6 +97,7 @@ gemma, gemma2, git, + glm, glpn, gpt2, gpt_bigcode, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6d55f87d60ac..4e4e6603b404 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -114,6 +114,7 @@ ("gemma", "GemmaConfig"), 
("gemma2", "Gemma2Config"), ("git", "GitConfig"), + ("glm", "GlmConfig"), ("glpn", "GLPNConfig"), ("gpt-sw3", "GPT2Config"), ("gpt2", "GPT2Config"), @@ -413,6 +414,7 @@ ("gemma", "Gemma"), ("gemma2", "Gemma2"), ("git", "GIT"), + ("glm", "GLM"), ("glpn", "GLPN"), ("gpt-sw3", "GPT-Sw3"), ("gpt2", "OpenAI GPT-2"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 6e730e848db7..37598380dfab 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -111,6 +111,7 @@ ("gemma", "GemmaModel"), ("gemma2", "Gemma2Model"), ("git", "GitModel"), + ("glm", "GlmModel"), ("glpn", "GLPNModel"), ("gpt-sw3", "GPT2Model"), ("gpt2", "GPT2Model"), @@ -483,6 +484,7 @@ ("gemma", "GemmaForCausalLM"), ("gemma2", "Gemma2ForCausalLM"), ("git", "GitForCausalLM"), + ("glm", "GlmForCausalLM"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), ("gpt_bigcode", "GPTBigCodeForCausalLM"), @@ -909,6 +911,7 @@ ("funnel", "FunnelForSequenceClassification"), ("gemma", "GemmaForSequenceClassification"), ("gemma2", "Gemma2ForSequenceClassification"), + ("glm", "GlmForSequenceClassification"), ("gpt-sw3", "GPT2ForSequenceClassification"), ("gpt2", "GPT2ForSequenceClassification"), ("gpt_bigcode", "GPTBigCodeForSequenceClassification"), @@ -1093,6 +1096,7 @@ ("funnel", "FunnelForTokenClassification"), ("gemma", "GemmaForTokenClassification"), ("gemma2", "Gemma2ForTokenClassification"), + ("glm", "GlmForTokenClassification"), ("gpt-sw3", "GPT2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), ("gpt_bigcode", "GPTBigCodeForTokenClassification"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 6a5cba11f094..3be273d012a4 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -204,6 +204,7 @@ ), ), ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), From dfa54bbd281bcad9ef613982b42dac3869eae476 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 16:59:54 +0200 Subject: [PATCH 17/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 2372631595b3..87f0a4a1dadc 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -84,7 +84,7 @@ def convert_config(original_config: dict): vocab_size=original_config.pop("padded_vocab_size"), hidden_size=original_config.pop("hidden_size"), intermediate_size=original_config.pop("ffn_hidden_size"), - num_hidden_layers=original_config.pop("num_hidden_layer"), + num_hidden_layers=original_config.pop("num_layers"), num_attention_heads=num_attention_heads, num_key_value_heads=num_attention_heads if not original_config.pop("multi_query_attention") From ecd5bf462922367496f64515a6d9d691dece70b0 Mon Sep 17 00:00:00 2001 From: Cyril 
Vallez Date: Thu, 26 Sep 2024 17:02:58 +0200 Subject: [PATCH 18/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 87f0a4a1dadc..512e40053779 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -91,7 +91,7 @@ def convert_config(original_config: dict): else original_config.pop("multi_query_group_num"), resid_pdrop=original_config.pop("hidden_dropout"), attention_dropout=original_config.pop("attention_dropout"), - max_position_embeddings=original_config.pop("max_position_embeddings"), + max_position_embeddings=original_config.pop("seq_length"), initializer_range=original_config.pop("initializer_range"), rms_norm_eps=original_config.pop("layernorm_epsilon"), rope_theta=10000.0 * original_config.pop("rope_ratio"), From ccacc3be63e3c8292c2fc2f20188fc79b0658175 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 17:06:05 +0200 Subject: [PATCH 19/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 512e40053779..682fecad8ff0 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -86,13 +86,14 @@ def convert_config(original_config: dict): intermediate_size=original_config.pop("ffn_hidden_size"), num_hidden_layers=original_config.pop("num_layers"), num_attention_heads=num_attention_heads, - num_key_value_heads=num_attention_heads - if not original_config.pop("multi_query_attention") - else original_config.pop("multi_query_group_num"), + num_key_value_heads=( + num_attention_heads + if not original_config.pop("multi_query_attention") + else original_config.pop("multi_query_group_num") + ), resid_pdrop=original_config.pop("hidden_dropout"), attention_dropout=original_config.pop("attention_dropout"), max_position_embeddings=original_config.pop("seq_length"), - initializer_range=original_config.pop("initializer_range"), rms_norm_eps=original_config.pop("layernorm_epsilon"), rope_theta=10000.0 * original_config.pop("rope_ratio"), use_rms_norm=original_config.pop("rmsnorm"), From bd9b9eebeeeb29ad946c7d7b8a0fc0b5406acb3e Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 17:08:36 +0200 Subject: [PATCH 20/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 682fecad8ff0..15c4d528d9c9 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -95,7 +95,7 @@ def convert_config(original_config: dict): attention_dropout=original_config.pop("attention_dropout"), max_position_embeddings=original_config.pop("seq_length"), rms_norm_eps=original_config.pop("layernorm_epsilon"), - rope_theta=10000.0 * original_config.pop("rope_ratio"), + rope_theta=10000.0 * original_config.pop("rope_ratio", 1), use_rms_norm=original_config.pop("rmsnorm"), 
apply_residual_connection_post_layernorm=original_config.pop("apply_residual_connection_post_layernorm"), post_layer_norm=original_config.pop("post_layer_norm"), From bba6b121ffa923d56127eb72e3e65dcfa562d93e Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 17:16:59 +0200 Subject: [PATCH 21/39] Update convert_glm_weights_to_hf.py --- .../models/glm/convert_glm_weights_to_hf.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 15c4d528d9c9..4164396218f6 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -3,7 +3,7 @@ import os import torch -from safetensors.torch import load_file as safe_load_file +from safetensors.torch import save_file, load_file from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors from transformers import AutoTokenizer, GlmConfig, GlmForCausalLM, PreTrainedTokenizerFast @@ -58,11 +58,13 @@ def merge_safetensors(input_dir: str): all_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] all_files = sorted(all_files, key=lambda x: int(x.split("-", 2)[1])) - output_path = os.path.join(input_dir, "consolidated.safetensors") - with open(output_path, "wb") as f_out: - for filepath in all_files: - with open(filepath, "rb") as f_in: - f_out.write(f_in.read()) + all_weights = {} + for file in all_files: + tensors = load_file(file) + all_weights.update(tensors) + + return all_weights + def convert_state_dict(original_state_dict: dict): @@ -104,7 +106,7 @@ def convert_config(original_config: dict): attention_bias=original_config.pop("add_qkv_bias"), linear_bias=original_config.pop("add_bias_linear"), eos_token_id=original_config.pop("eos_token_id"), - pad_token_id=original_config["pad_token_id"], + pad_token_id=original_config.pop("pad_token_id"), ) print(f"Unused config keys: {original_config.keys(),}") return new_config @@ -134,8 +136,7 @@ def convert_glm_model(input_dir, output_dir): config.save_pretrained(output_dir) # Load and convert weights - merge_safetensors(input_dir) - original_state_dict = safe_load_file(os.path.join(input_dir, "consolidated.safetensors")) + original_state_dict = merge_safetensors(input_dir) new_dict = convert_state_dict(original_state_dict) with torch.device("meta"): model = GlmForCausalLM.from_config(config) From 13934a8bc881d7f6ccae31d2f4e986decb6cf5d3 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 17:21:11 +0200 Subject: [PATCH 22/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 4164396218f6..8c702435e6c4 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -139,7 +139,7 @@ def convert_glm_model(input_dir, output_dir): original_state_dict = merge_safetensors(input_dir) new_dict = convert_state_dict(original_state_dict) with torch.device("meta"): - model = GlmForCausalLM.from_config(config) + model = GlmForCausalLM(config) model.load_state_dict(new_dict, strict=True, assign=True) model.save_pretrained(output_dir) From 678062d2a9984bd4a8b6d381a838ed0eb93d333f Mon Sep 17 00:00:00 2001 From: 
Cyril Vallez Date: Thu, 26 Sep 2024 17:32:00 +0200 Subject: [PATCH 23/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 8c702435e6c4..2c8e265f26b1 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -11,9 +11,10 @@ STATE_DICT_MAPPING = { - "transformer.": "model.", "transformer.output_layer.": "lm_head.", - ".embedding.": ".embed_tokens.", + "transformer.": "model.", + ".embedding.word_embeddings": ".embed_tokens.", + ".encoder.norm.": ".norm.", ".encoder.layers.": ".layers.", "final_layernorm.": "norm.", "rotary_pos_embed.": "rotary_emb.", @@ -71,6 +72,10 @@ def convert_state_dict(original_state_dict: dict): new_dict = {} for key, value in original_state_dict.items(): + # Should not be part of the state dict + if "rotary_pos_emb.inv_freq" in key: + continue + new_key = key for old, new in STATE_DICT_MAPPING.items(): new_key = new_key.replace(old, new) From 967944f492ea5e096cb23a89f0b3e59478dc3f44 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 17:34:02 +0200 Subject: [PATCH 24/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 2c8e265f26b1..7f859204f5f0 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -13,7 +13,7 @@ STATE_DICT_MAPPING = { "transformer.output_layer.": "lm_head.", "transformer.": "model.", - ".embedding.word_embeddings": ".embed_tokens.", + ".embedding.word_embeddings.": ".embed_tokens.", ".encoder.norm.": ".norm.", ".encoder.layers.": ".layers.", "final_layernorm.": "norm.", From 2588ee77324123c465a06331bb6be7510ff51e5a Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 26 Sep 2024 17:36:58 +0200 Subject: [PATCH 25/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 7f859204f5f0..8884c0e13bef 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -14,9 +14,8 @@ "transformer.output_layer.": "lm_head.", "transformer.": "model.", ".embedding.word_embeddings.": ".embed_tokens.", - ".encoder.norm.": ".norm.", + ".encoder.final_layernorm.": ".norm.", ".encoder.layers.": ".layers.", - "final_layernorm.": "norm.", "rotary_pos_embed.": "rotary_emb.", "self_attention.": "self_attn.", "query_key_value.": "qkv_proj.", From 8756c1077cf05995df789f84d09c27bb73477622 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 16:12:07 +0200 Subject: [PATCH 26/39] Correct the rotary embeddings --- src/transformers/models/glm/modular_glm.py | 50 +++++++++++++++++++++- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index b6688198de90..8173ba8f8db5 100644 --- a/src/transformers/models/glm/modular_glm.py 
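Before the rotary-embedding patch below: the three `convert_glm_weights_to_hf.py` patches above settle the checkpoint-key renaming. Here is a standalone sketch of the resulting behaviour, using a subset of the mapping copied from those patches (not the converter itself), mainly to show why `transformer.output_layer.` has to be replaced before the bare `transformer.` prefix and why `inv_freq` buffers are dropped.

```python
# Hedged sketch: ordered substring replacement over checkpoint keys, as in convert_state_dict.
from typing import Optional

STATE_DICT_MAPPING = {
    "transformer.output_layer.": "lm_head.",  # must come before the generic "transformer." rule
    "transformer.": "model.",
    ".embedding.word_embeddings.": ".embed_tokens.",
    ".encoder.final_layernorm.": ".norm.",
    ".encoder.layers.": ".layers.",
    "self_attention.": "self_attn.",
    "query_key_value.": "qkv_proj.",
}


def rename_key(key: str) -> Optional[str]:
    if "rotary_pos_emb.inv_freq" in key:
        return None  # recomputed by the HF model, so the converter skips it
    for old, new in STATE_DICT_MAPPING.items():
        key = key.replace(old, new)
    return key


assert rename_key("transformer.output_layer.weight") == "lm_head.weight"
assert (
    rename_key("transformer.encoder.layers.0.self_attention.query_key_value.weight")
    == "model.layers.0.self_attn.qkv_proj.weight"
)
assert rename_key("transformer.rotary_pos_emb.inv_freq") is None
```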
+++ b/src/transformers/models/glm/modular_glm.py @@ -40,8 +40,6 @@ Phi3MLP, Phi3RMSNorm, Phi3RotaryEmbedding, - apply_rotary_pos_emb, - repeat_kv, ) @@ -130,6 +128,54 @@ def __init__(self, config): self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=self.config.linear_bias) +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., 0::2] + x2 = x[..., 1::2] + return torch.stack((-x2, x1), dim=-1).flatten(-2) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + # Interleave them instead of usual shape + cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) + sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) + + # Keep half for later concatenation + q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :] + k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :] + + # Apply rotary embeddings on the first half + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) + return q_embed, k_embed + + class GlmAttention(nn.Module): def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): super().__init__() From e633c22789625610b71d628bc5044c255dc20e0f Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 16:44:15 +0200 Subject: [PATCH 27/39] Remove apply_residual_connection_post_layernorm (always false) --- .../models/glm/convert_glm_weights_to_hf.py | 1 - src/transformers/models/glm/modular_glm.py | 15 ++++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 8884c0e13bef..6540f97bbe79 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -103,7 +103,6 @@ def convert_config(original_config: dict): rms_norm_eps=original_config.pop("layernorm_epsilon"), rope_theta=10000.0 * original_config.pop("rope_ratio", 1), use_rms_norm=original_config.pop("rmsnorm"), - apply_residual_connection_post_layernorm=original_config.pop("apply_residual_connection_post_layernorm"), 
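A quick sanity check on the interleaved, partial rotary embedding introduced in the "Correct the rotary embeddings" patch above. The snippet is a condensed standalone copy of the two helpers (not an import from transformers) and verifies two properties: only the first half of `head_dim` is rotated, and the rotated half is a pure rotation, so vector norms are preserved. The tensor shapes are arbitrary demo values.

```python
# Standalone check of GLM-style partial, interleaved RoPE (helpers duplicated from the patch).
import torch


def rotate_half(x):
    x1, x2 = x[..., 0::2], x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)


def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
    # Interleaved layout: each (cos_i, sin_i) pair is repeated for two adjacent channels
    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
    # Only the first half of head_dim is rotated; the rest is concatenated back unchanged
    q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :]
    k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :]
    q_embed = torch.cat([(q * cos) + (rotate_half(q) * sin), q_pass], dim=-1)
    k_embed = torch.cat([(k * cos) + (rotate_half(k) * sin), k_pass], dim=-1)
    return q_embed, k_embed


batch, heads, seq, head_dim = 1, 2, 5, 8
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
# cos/sin as produced by a rotary embedding of dim=head_dim // 2 (random angles for the demo)
angles = torch.randn(batch, seq, head_dim // 2)
q_rot, k_rot = apply_rotary_pos_emb(q, k, angles.cos(), angles.sin())

assert q_rot.shape == q.shape
assert torch.equal(q_rot[..., head_dim // 2 :], q[..., head_dim // 2 :])  # pass-through half
assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-5)  # norm preserved
```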
post_layer_norm=original_config.pop("post_layer_norm"), use_cache=original_config.pop("use_cache"), head_dim=original_config.pop("kv_channels"), diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 8173ba8f8db5..3c0b84966378 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -68,7 +68,6 @@ def __init__( initializer_range=0.02, rms_norm_eps=0.00000015625, use_rms_norm=True, - apply_residual_connection_post_layernorm=False, post_layer_norm=True, use_cache=True, tie_word_embeddings=False, @@ -101,7 +100,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_rms_norm = use_rms_norm - self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm self.post_layer_norm = post_layer_norm self.use_cache = use_cache self.initializer_range = initializer_range @@ -479,7 +477,6 @@ def __init__(self, config: GlmConfig, layer_idx: int): super().__init__() self.config = config - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.mlp = GlmMLP(config) @@ -532,12 +529,12 @@ def forward( into the model """ - hidden_states_after_norm = self.input_layernorm(hidden_states) - residual = hidden_states_after_norm if self.apply_residual_connection_post_layernorm else hidden_states + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) # Self Attention attn_outputs, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states_after_norm, + hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, @@ -548,11 +545,11 @@ def forward( ) hidden_states = residual + self.resid_attn_dropout(attn_outputs) + residual = hidden_states - hidden_states_after_norm = self.post_attention_layernorm(hidden_states) - residual = hidden_states_after_norm if self.apply_residual_connection_post_layernorm else hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states_after_norm) + hidden_states = self.mlp(hidden_states) hidden_states = residual + self.resid_mlp_dropout(hidden_states) outputs = (hidden_states,) From ea3ee4e5d7eb025af84211946aa72479ed7aa9f6 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 16:49:42 +0200 Subject: [PATCH 28/39] remove use_rms_norm (always true) --- .../models/glm/convert_glm_weights_to_hf.py | 1 - src/transformers/models/glm/modular_glm.py | 20 +++---------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 6540f97bbe79..45dae5109c88 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -102,7 +102,6 @@ def convert_config(original_config: dict): max_position_embeddings=original_config.pop("seq_length"), rms_norm_eps=original_config.pop("layernorm_epsilon"), rope_theta=10000.0 * original_config.pop("rope_ratio", 1), - use_rms_norm=original_config.pop("rmsnorm"), post_layer_norm=original_config.pop("post_layer_norm"), use_cache=original_config.pop("use_cache"), head_dim=original_config.pop("kv_channels"), diff --git a/src/transformers/models/glm/modular_glm.py 
b/src/transformers/models/glm/modular_glm.py index 3c0b84966378..9893cf9f4d39 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -67,7 +67,6 @@ def __init__( max_position_embeddings=131072, initializer_range=0.02, rms_norm_eps=0.00000015625, - use_rms_norm=True, post_layer_norm=True, use_cache=True, tie_word_embeddings=False, @@ -99,7 +98,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps - self.use_rms_norm = use_rms_norm self.post_layer_norm = post_layer_norm self.use_cache = use_cache self.initializer_range = initializer_range @@ -480,19 +478,11 @@ def __init__(self, config: GlmConfig, layer_idx: int): self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.mlp = GlmMLP(config) - self.input_layernorm = ( - GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - if config.use_rms_norm - else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) - ) + self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) - self.post_attention_layernorm = ( - GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - if config.use_rms_norm - else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) - ) + self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( self, @@ -570,11 +560,7 @@ def __init__(self, config: GlmConfig): [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) if config.post_layer_norm: - self.norm = ( - GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - if config.use_rms_norm - else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) - ) + self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) else: self.norm = nn.Identity() self.rotary_emb = GlmRotaryEmbedding( From c2f0a8dba9645e0f00b919903c2eafe5dd863146 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 16:52:18 +0200 Subject: [PATCH 29/39] remove past_layer_norm (always true) --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 1 - src/transformers/models/glm/modular_glm.py | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 45dae5109c88..a6065a91e777 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -102,7 +102,6 @@ def convert_config(original_config: dict): max_position_embeddings=original_config.pop("seq_length"), rms_norm_eps=original_config.pop("layernorm_epsilon"), rope_theta=10000.0 * original_config.pop("rope_ratio", 1), - post_layer_norm=original_config.pop("post_layer_norm"), use_cache=original_config.pop("use_cache"), head_dim=original_config.pop("kv_channels"), attention_bias=original_config.pop("add_qkv_bias"), diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 9893cf9f4d39..b80de60476ac 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -67,7 +67,6 @@ def __init__( max_position_embeddings=131072, initializer_range=0.02, rms_norm_eps=0.00000015625, - post_layer_norm=True, use_cache=True, tie_word_embeddings=False, 
rope_theta=10000.0, @@ -98,7 +97,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps - self.post_layer_norm = post_layer_norm self.use_cache = use_cache self.initializer_range = initializer_range self.use_cache = use_cache @@ -559,10 +557,7 @@ def __init__(self, config: GlmConfig): self.layers = nn.ModuleList( [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - if config.post_layer_norm: - self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = nn.Identity() + self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.rotary_emb = GlmRotaryEmbedding( dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta ) From e25135267659349295b4a3a4b451fde2d9773d75 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 17:04:17 +0200 Subject: [PATCH 30/39] Update __init__.py --- src/transformers/models/glm/__init__.py | 48 ++++--------------------- 1 file changed, 7 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/glm/__init__.py b/src/transformers/models/glm/__init__.py index 7e5fecb03080..250498caec02 100644 --- a/src/transformers/models/glm/__init__.py +++ b/src/transformers/models/glm/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,49 +13,15 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_glm": ["GlmConfig"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_glm"] = [ - "GlmForCausalLM", - "GlmModel", - "GlmPreTrainedModel", - "GlmForSequenceClassification", - "GlmForTokenClassification", - ] - if TYPE_CHECKING: - from .configuration_glm import GlmConfig - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_glm import ( - GlmForCausalLM, - GlmForSequenceClassification, - GlmForTokenClassification, - GlmModel, - GlmPreTrainedModel, - ) - + from .configuration_glm import * + from .modeling_glm import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) \ No newline at end of file From 4fbcfce83a69e124487e3f8d090af6bf2c6c699b Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 17:33:17 +0200 Subject: [PATCH 31/39] Update config and license --- .../models/glm/configuration_glm.py | 110 ++++++++++++++---- src/transformers/models/glm/modeling_glm.py | 35 ++---- src/transformers/models/glm/modular_glm.py | 38 ++---- 3 files changed, 109 insertions(+), 74 deletions(-) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index ad7d83f75edb..6d914ecb7f90 
100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -5,7 +5,7 @@ # modular_xxx.py file directly. One of our CI enforces this # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 -# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University and HuggingFace Inc. team. All rights reserved. # # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +25,80 @@ class GlmConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GlmModel`]. It is used to instantiate an Glm + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Glm-7B. + e.g. [google/glm-7b](https://huggingface.co/google/glm-7b) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the Glm model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GlmModel`] + hidden_size (`int`, *optional*, defaults to 3072): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 24576): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 28): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + head_dim (`int`, *optional*, defaults to 256): + The attention head dimension. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The legacy activation function. It is overwritten by the `hidden_activation`. + hidden_activation (`str` or `function`, *optional*): + The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"` + if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + eos_token_id (`int`, *optional*, defaults to 1): + End of stream token id. + bos_token_id (`int`, *optional*, defaults to 2): + Beginning of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + ```python + >>> from transformers import GlmModel, GlmConfig + >>> # Initializing a Glm glm-7b style configuration + >>> configuration = GlmConfig() + >>> # Initializing a model from the glm-7b style configuration + >>> model = GlmModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + resid_pdrop (`float`, *optional*, defaults to `0.0`): + Dropout ratio in the decoder layers. + linear_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the MLP layers, as well as the query, key, value and output projection layers during self-attention. + """ + model_type = "glm" + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, @@ -35,52 +108,45 @@ def __init__( num_hidden_layers=40, num_attention_heads=32, num_key_value_heads=2, + head_dim=128, + hidden_act="silu", resid_pdrop=0.0, attention_dropout=0.0, - hidden_act="silu", max_position_embeddings=131072, initializer_range=0.02, rms_norm_eps=0.00000015625, - use_rms_norm=True, - apply_residual_connection_post_layernorm=False, - post_layer_norm=True, use_cache=True, tie_word_embeddings=False, rope_theta=10000.0, pad_token_id=151329, eos_token_id=[151329, 151336, 151338], bos_token_id=None, - head_dim=128, attention_bias=True, linear_bias=False, **kwargs, ): - super().__init__( - tie_word_embeddings=tie_word_embeddings, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads + self.head_dim = head_dim self.num_key_value_heads = num_key_value_heads - self.resid_pdrop = resid_pdrop - self.attention_dropout = attention_dropout self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps - self.use_rms_norm = use_rms_norm - self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm - self.post_layer_norm = post_layer_norm - self.use_cache = use_cache - self.initializer_range = initializer_range self.use_cache = use_cache self.rope_theta = rope_theta - self.head_dim = head_dim self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.resid_pdrop = resid_pdrop self.linear_bias = linear_bias + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 
8201101fbec6..284911e9f66b 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -5,7 +5,7 @@ # modular_xxx.py file directly. One of our CI enforces this # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 -# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University and HuggingFace Inc. team. All rights reserved. # # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -531,23 +531,14 @@ def __init__(self, config: GlmConfig, layer_idx: int): super().__init__() self.config = config - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.mlp = GlmMLP(config) - self.input_layernorm = ( - GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - if config.use_rms_norm - else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) - ) + self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) - self.post_attention_layernorm = ( - GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - if config.use_rms_norm - else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) - ) + self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( self, @@ -584,12 +575,12 @@ def forward( into the model """ - hidden_states_after_norm = self.input_layernorm(hidden_states) - residual = hidden_states_after_norm if self.apply_residual_connection_post_layernorm else hidden_states + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) # Self Attention attn_outputs, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states_after_norm, + hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, @@ -600,11 +591,11 @@ def forward( ) hidden_states = residual + self.resid_attn_dropout(attn_outputs) + residual = hidden_states - hidden_states_after_norm = self.post_attention_layernorm(hidden_states) - residual = hidden_states_after_norm if self.apply_residual_connection_post_layernorm else hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states_after_norm) + hidden_states = self.mlp(hidden_states) hidden_states = residual + self.resid_mlp_dropout(hidden_states) outputs = (hidden_states,) @@ -767,14 +758,6 @@ def __init__(self, config: GlmConfig): dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta ) self.gradient_checkpointing = False - if config.post_layer_norm: - self.norm = ( - GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - if config.use_rms_norm - else nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) - ) - else: - self.norm = nn.Identity() # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index b80de60476ac..ee89fead78d5 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. 
+# Copyright 2024 The Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University and HuggingFace Inc. team. All rights reserved. # # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,13 +21,13 @@ import torch.utils.checkpoint from ...cache_utils import Cache -from ...configuration_utils import PretrainedConfig from ...modeling_flash_attention_utils import _flash_attention_forward from ...utils import ( is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging, ) +from ..gemma.configuration_gemma import GemmaConfig from ..gemma.modeling_gemma import ( GemmaForCausalLM, GemmaForSequenceClassification, @@ -50,7 +50,13 @@ logger = logging.get_logger(__name__) -class GlmConfig(PretrainedConfig): +class GlmConfig(GemmaConfig): + """ + resid_pdrop (`float`, *optional*, defaults to `0.0`): + Dropout ratio in the decoder layers. + linear_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the MLP layers, as well as the query, key, value and output projection layers during self-attention. + """ model_type = "glm" def __init__( @@ -61,9 +67,10 @@ def __init__( num_hidden_layers=40, num_attention_heads=32, num_key_value_heads=2, + head_dim=128, + hidden_act="silu", resid_pdrop=0.0, attention_dropout=0.0, - hidden_act="silu", max_position_embeddings=131072, initializer_range=0.02, rms_norm_eps=0.00000015625, @@ -73,37 +80,16 @@ def __init__( pad_token_id=151329, eos_token_id=[151329, 151336, 151338], bos_token_id=None, - head_dim=128, attention_bias=True, linear_bias=False, **kwargs, ): super().__init__( - tie_word_embeddings=tie_word_embeddings, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, **kwargs, ) - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads self.resid_pdrop = resid_pdrop - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.initializer_range = initializer_range - self.use_cache = use_cache - self.rope_theta = rope_theta - self.head_dim = head_dim - self.attention_bias = attention_bias self.linear_bias = linear_bias + del self.hidden_activation class GlmRMSNorm(Phi3RMSNorm): From a1692ab2c19f01a1e577f94b1287c67c10008832 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 18:32:24 +0200 Subject: [PATCH 32/39] start adding tests and doc --- docs/source/en/model_doc/glm.md | 99 ++++ src/transformers/models/glm/modular_glm.py | 2 +- tests/models/glm/__init__.py | 0 tests/models/glm/test_modeling_glm.py | 514 +++++++++++++++++++++ 4 files changed, 614 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/model_doc/glm.md create mode 100644 tests/models/glm/__init__.py create mode 100644 tests/models/glm/test_modeling_glm.py diff --git a/docs/source/en/model_doc/glm.md b/docs/source/en/model_doc/glm.md new file mode 100644 index 000000000000..d54c37f66513 --- /dev/null +++ b/docs/source/en/model_doc/glm.md @@ -0,0 +1,99 @@ + + +# GLM + +## Overview + +The GLM Model was proposed +in [ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools](https://arxiv.org/html/2406.12793v1) +by GLM Team, THUDM & ZhipuAI. 
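Returning briefly to the rewrite of `GlmConfig` as `GlmConfig(GemmaConfig)` earlier in this patch: the modular file only states the delta (new `resid_pdrop` and `linear_bias` fields, `del self.hidden_activation`), and the modular converter expands it into the full standalone configuration file. A toy sketch of that "state only the delta" idea, with made-up class names rather than the real configs:

```python
# Toy illustration: inherit the parent config, add model-specific fields, delete inherited ones.
class ParentConfig:
    def __init__(self, hidden_size=64, hidden_activation="gelu_pytorch_tanh", **kwargs):
        self.hidden_size = hidden_size
        self.hidden_activation = hidden_activation


class ChildConfig(ParentConfig):
    def __init__(self, resid_pdrop=0.0, linear_bias=False, **kwargs):
        super().__init__(**kwargs)
        self.resid_pdrop = resid_pdrop
        self.linear_bias = linear_bias
        del self.hidden_activation  # mirrors `del self.hidden_activation` in modular_glm.py


cfg = ChildConfig(hidden_size=128)
assert cfg.hidden_size == 128
assert not hasattr(cfg, "hidden_activation")
```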
+ +The abstract from the paper is the following: + +*We introduce ChatGLM, an evolving family of large language models that we have been developing over time. This report +primarily focuses on the GLM-4 language series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. They represent our most +capable models that are trained with all the insights and lessons gained from the preceding three generations of +ChatGLM. To date, the GLM-4 models are pre-trained on ten trillions of tokens mostly in Chinese and English, along with +a small set of corpus from 24 languages, and aligned primarily for Chinese and English usage. The high-quality alignment +is achieved via a multi-stage post-training process, which involves supervised fine-tuning and learning from human +feedback. Evaluations show that GLM-4 1) closely rivals or outperforms GPT-4 in terms of general metrics such as MMLU, +GSM8K, MATH, BBH, GPQA, and HumanEval, 2) gets close to GPT-4-Turbo in instruction following as measured by IFEval, 3) +matches GPT-4 Turbo (128K) and Claude 3 for long context tasks, and 4) outperforms GPT-4 in Chinese alignments as +measured by AlignBench. The GLM-4 All Tools model is further aligned to understand user intent and autonomously decide +when and which tool(s) to useβ€”including web browser, Python interpreter, text-to-image model, and user-defined +functionsβ€”to effectively complete complex tasks. In practical applications, it matches and even surpasses GPT-4 All +Tools in tasks like accessing online information via web browsing and solving math problems using Python interpreter. +Over the course, we have open-sourced a series of models, including ChatGLM-6B (three generations), GLM-4-9B (128K, 1M), +GLM-4V-9B, WebGLM, and CodeGeeX, attracting over 10 million downloads on Hugging face in the year 2023 alone.* + +Tips: + +- This model was contributed by [THUDM](https://huggingface.co/THUDM). The most recent code can be + found [here](https://github.com/thudm/GLM-4). + + +## Usage tips + +`GLM-4` can be found on the [Huggingface Hub](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7) + +In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. + +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer +>>> device = "cuda" # the device to load the model onto + +>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat") + +>>> prompt = "Give me a short introduction to large language model." 
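+>>> # The next steps build the chat `messages` and render them with the model's chat template;
+>>> # `add_generation_prompt=True` appends the assistant prefix so generation starts a new
+>>> # assistant turn, and the rendered string is then tokenized like any other prompt.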
+ +>>> messages = [{"role": "user", "content": prompt}] + +>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + +>>> model_inputs = tokenizer([text], return_tensors="pt").to(device) + +>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True) + +>>> generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] + +>>> response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` + +## GLMConfig + +[[autodoc]] GLMConfig + +## GLMModel + +[[autodoc]] GLMModel + - forward + +## GLMForCausalLM + +[[autodoc]] GLMForCausalLM + - forward + +## GLMForSequenceClassification + +[[autodoc]] GLMForSequenceClassification + - forward + +## GLMForTokenClassification + +[[autodoc]] GLMForTokenClassification + - forward diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index ee89fead78d5..35deba39a483 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2024 The Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University and HuggingFace Inc. team. All rights reserved. +# Copyright 2024 TThe GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. # # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/models/glm/__init__.py b/tests/models/glm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py new file mode 100644 index 000000000000..b50c5b15a66d --- /dev/null +++ b/tests/models/glm/test_modeling_glm.py @@ -0,0 +1,514 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Glm model.""" + +import tempfile +import unittest + +import pytest +from packaging import version + +from transformers import AutoModelForCausalLM, AutoTokenizer, GlmConfig, is_torch_available +from transformers.testing_utils import ( + is_flaky, + require_bitsandbytes, + require_flash_attn, + require_read_token, + require_torch, + require_torch_accelerator, + require_torch_gpu, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GlmForCausalLM, + GlmForSequenceClassification, + GlmForTokenClassification, + GlmModel, + ) + + +@require_torch +class GlmModelTester: + config_class = GlmConfig + if is_torch_available(): + model_class = GlmModel + for_causal_lm_class = GlmForCausalLM + for_sequence_class = GlmForSequenceClassification + for_token_class = GlmForTokenClassification + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + intermediate_size=37, + hidden_act="silu", + attention_dropout=0.1, + resid_pdrop=0.1, + max_position_embeddings=512, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.resid_pdrop = resid_pdrop + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + self.head_dim = self.hidden_size // self.num_attention_heads + + # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.prepare_config_and_inputs + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return self.config_class( + vocab_size=self.vocab_size, + 
hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + attention_dropout=self.attention_dropout, + resid_pdrop=self.resid_pdrop, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + head_dim=self.head_dim, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = self.model_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = self.model_class(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = self.for_causal_lm_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = self.for_causal_lm_class(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select 
random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Glm + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GlmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (GlmModel, GlmForCausalLM, GlmForSequenceClassification, GlmForTokenClassification) + if is_torch_available() + else () + ) + all_generative_model_classes = (GlmForCausalLM,) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": GlmModel, + "text-classification": GlmForSequenceClassification, + "token-classification": GlmForTokenClassification, + "text-generation": GlmForCausalLM, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + + # Need to remove 0.9 in `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.6] + + # used in `test_torch_compile` + _torch_compile_test_ckpt = "THUDM/glm-4-9b" + + # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 + def is_pipeline_test_to_skip( + self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + ): + return True + + def setUp(self): + self.model_tester = GlmModelTester(self) + self.config_tester = ConfigTester(self, config_class=GlmConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_Glm_sequence_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + print(config) + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Glm_sequence_classification_model_for_single_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "single_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Glm_sequence_classification_model_for_multi_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "multi_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor( + [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size + ).to(torch.float) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Glm_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = self.model_tester.for_token_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, 
attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + + @unittest.skip(reason="Glm uses GQA on all models so the KV cache is a non standard format") + def test_past_key_values_format(self): + pass + + +@slow +@require_torch_accelerator +class GlmIntegrationTest(unittest.TestCase): + input_text = ["Hello I am doing", "Hi today"] + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + def test_model_9b_fp16(self): + model_id = "THUDM/glm-4-9b" + EXPECTED_TEXTS = [ + 'Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the', + 'Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.' + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_9b_bf16(self): + model_id = "THUDM/glm-4-9b" + + EXPECTED_TEXTS = [ + 'Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the', + 'Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.' 
+ ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_9b_eager(self): + model_id = "THUDM/glm-4-9b" + + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_torch_sdpa + def test_model_9b_sdpa(self): + model_id = "THUDM/glm-4-9b" + + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa" + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_flash_attn + @pytest.mark.flash_attn_test + def test_model_9b_flash_attn(self): + model_id = "THUDM/glm-4-9b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) \ No newline at end of file From 3c802740108d34fe6e5d2387cd4c7dbbf418cb1e Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 19:15:27 +0200 Subject: [PATCH 33/39] Add doc + style --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/index.md | 1 + docs/source/en/perf_infer_gpu_one.md | 2 ++ src/transformers/models/glm/__init__.py | 2 +- .../models/glm/configuration_glm.py | 2 +- .../models/glm/convert_glm_weights_to_hf.py | 5 ++--- src/transformers/models/glm/modeling_glm.py | 2 +- src/transformers/models/glm/modular_glm.py | 3 ++- tests/models/glm/test_modeling_glm.py | 18 ++++++------------ 9 files changed, 18 
insertions(+), 19 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ae632376f946..36588a2fcd78 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -410,6 +410,8 @@ title: Gemma - local: model_doc/gemma2 title: Gemma2 + - local: model_doc/glm + title: GLM - local: model_doc/openai-gpt title: GPT - local: model_doc/gpt_neo diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 0a5518fd71c8..7884913785b4 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -136,6 +136,7 @@ Flax), PyTorch, and/or TensorFlow. | [ErnieM](model_doc/ernie_m) | βœ… | ❌ | ❌ | | [ESM](model_doc/esm) | βœ… | βœ… | ❌ | | [FairSeq Machine-Translation](model_doc/fsmt) | βœ… | ❌ | ❌ | +| [GLM](model_doc/glm) | βœ… | ❌ | ❌ | | [Falcon](model_doc/falcon) | βœ… | ❌ | ❌ | | [FalconMamba](model_doc/falcon_mamba) | βœ… | ❌ | ❌ | | [FastSpeech2Conformer](model_doc/fastspeech2_conformer) | βœ… | ❌ | ❌ | diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 346759aa2b25..577e39e88567 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -42,6 +42,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon) * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) +* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) @@ -214,6 +215,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertModel) * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon) * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel) +* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) diff --git a/src/transformers/models/glm/__init__.py b/src/transformers/models/glm/__init__.py index 250498caec02..7e0a5d6cc287 100644 --- a/src/transformers/models/glm/__init__.py +++ b/src/transformers/models/glm/__init__.py @@ -24,4 +24,4 @@ import sys _file = globals()["__file__"] - sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 6d914ecb7f90..2716d0068522 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -5,7 +5,7 @@ # modular_xxx.py file 
directly. One of our CI enforces this # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 -# Copyright 2024 The Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University and HuggingFace Inc. team. All rights reserved. +# Copyright 2024 TThe GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. # # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index a6065a91e777..800270b96da7 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -3,7 +3,7 @@ import os import torch -from safetensors.torch import save_file, load_file +from safetensors.torch import load_file from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors from transformers import AutoTokenizer, GlmConfig, GlmForCausalLM, PreTrainedTokenizerFast @@ -62,9 +62,8 @@ def merge_safetensors(input_dir: str): for file in all_files: tensors = load_file(file) all_weights.update(tensors) - + return all_weights - def convert_state_dict(original_state_dict: dict): diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 284911e9f66b..46aa4729ba9f 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -5,7 +5,7 @@ # modular_xxx.py file directly. One of our CI enforces this # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 -# Copyright 2024 The Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University and HuggingFace Inc. team. All rights reserved. +# Copyright 2024 TThe GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. # # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 35deba39a483..ba3af15eec05 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2024 TThe GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. # # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -57,6 +57,7 @@ class GlmConfig(GemmaConfig): linear_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the MLP layers, as well as the query, key, value and output projection layers during self-attention. """ + model_type = "glm" def __init__( diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index b50c5b15a66d..c6c7c245054f 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -14,21 +14,15 @@ # limitations under the License. 
"""Testing suite for the PyTorch Glm model.""" -import tempfile import unittest import pytest -from packaging import version from transformers import AutoModelForCausalLM, AutoTokenizer, GlmConfig, is_torch_available from transformers.testing_utils import ( - is_flaky, - require_bitsandbytes, require_flash_attn, - require_read_token, require_torch, require_torch_accelerator, - require_torch_gpu, require_torch_sdpa, slow, torch_device, @@ -412,9 +406,9 @@ def setUpClass(cls): def test_model_9b_fp16(self): model_id = "THUDM/glm-4-9b" EXPECTED_TEXTS = [ - 'Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the', - 'Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.' - ] + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( torch_device @@ -432,8 +426,8 @@ def test_model_9b_bf16(self): model_id = "THUDM/glm-4-9b" EXPECTED_TEXTS = [ - 'Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the', - 'Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.' + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", ] model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( @@ -511,4 +505,4 @@ def test_model_9b_flash_attn(self): output = model.generate(**inputs, max_new_tokens=20, do_sample=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXTS) \ No newline at end of file + self.assertEqual(output_text, EXPECTED_TEXTS) From 73ecb143065ed516d5ffe64f4818b54c192f7de4 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 19:21:41 +0200 Subject: [PATCH 34/39] Update test_modeling_glm.py --- tests/models/glm/test_modeling_glm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index c6c7c245054f..57be6d28bfe2 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -446,8 +446,8 @@ def test_model_9b_eager(self): model_id = "THUDM/glm-4-9b" EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", ] model = AutoModelForCausalLM.from_pretrained( @@ -468,8 +468,8 @@ def test_model_9b_sdpa(self): model_id = "THUDM/glm-4-9b" EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy 
to make a DIY paper flower.", ] model = AutoModelForCausalLM.from_pretrained( @@ -490,8 +490,8 @@ def test_model_9b_sdpa(self): def test_model_9b_flash_attn(self): model_id = "THUDM/glm-4-9b" EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", ] model = AutoModelForCausalLM.from_pretrained( From d15d0e5353fa3cc8248d088aa59332d13e140ae8 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Fri, 27 Sep 2024 19:25:57 +0200 Subject: [PATCH 35/39] Add dummies --- src/transformers/utils/dummy_pt_objects.py | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index f4e471ee7ab5..0de12317fe7a 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1404,6 +1404,41 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class GLMForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GLMForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GLMForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GLMModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GLMPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class BigBirdPegasusForQuestionAnswering(metaclass=DummyObject): _backends = ["torch"] From eacc0adf0909406bd38d05409c5fa41386b8e06b Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Mon, 30 Sep 2024 12:05:19 +0200 Subject: [PATCH 36/39] Update back init (because __all__ is not generated from modular) --- src/transformers/__init__.py | 1 - src/transformers/models/glm/__init__.py | 47 +++++++++++++++++++++---- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 95ec1d874227..0f4565508358 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2553,7 +2553,6 @@ _import_structure["models.glm"].extend( [ "GlmForCausalLM", - "GlmForQuestionAnswering", "GlmForSequenceClassification", "GlmForTokenClassification", "GlmModel", diff --git a/src/transformers/models/glm/__init__.py b/src/transformers/models/glm/__init__.py index 7e0a5d6cc287..263f43a5ff9c 100644 --- a/src/transformers/models/glm/__init__.py +++ b/src/transformers/models/glm/__init__.py @@ -10,18 +10,51 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import _LazyModule -from ...utils.import_utils import define_import_structure +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) +_import_structure = { + "configuration_glm": ["GlmConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_glm"] = [ + "GlmForCausalLM", + "GlmModel", + "GlmPreTrainedModel", + "GlmForSequenceClassification", + "GlmForTokenClassification", + ] + if TYPE_CHECKING: - from .configuration_glm import * - from .modeling_glm import * + from .configuration_glm import GlmConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_glm import ( + GlmForCausalLM, + GlmForSequenceClassification, + GlmForTokenClassification, + GlmModel, + GlmPreTrainedModel, + ) + else: import sys - _file = globals()["__file__"] - sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) \ No newline at end of file From 499f7a53d15d6b1f1acc261f83dcc26de2e9a9cb Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Mon, 30 Sep 2024 12:10:06 +0200 Subject: [PATCH 37/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 800270b96da7..7559b99e2d2c 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -107,6 +107,7 @@ def convert_config(original_config: dict): linear_bias=original_config.pop("add_bias_linear"), eos_token_id=original_config.pop("eos_token_id"), pad_token_id=original_config.pop("pad_token_id"), + tie_word_embeddings=original_config.pop("tie_word_embeddings"), ) print(f"Unused config keys: {original_config.keys(),}") return new_config From 65085d2fb29ed3f1209e915c43803c989dfb5798 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Mon, 30 Sep 2024 14:13:00 +0200 Subject: [PATCH 38/39] Update convert_glm_weights_to_hf.py --- src/transformers/models/glm/convert_glm_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 7559b99e2d2c..ef5eda117593 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -56,7 +56,7 @@ def converted(self) -> Tokenizer: def merge_safetensors(input_dir: str): all_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] - all_files = sorted(all_files, key=lambda x: int(x.split("-", 2)[1])) + all_files = sorted(all_files, key=lambda x: int(x.rsplit("-", 3)[1])) all_weights = {} for file in all_files: From d273a11bfe85d8a750d39ab53e9c7acaf082f4d1 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Mon, 30 Sep 2024 16:21:28 +0200 Subject: [PATCH 39/39] apply corrected modular --- .../models/glm/configuration_glm.py | 2 +- src/transformers/models/glm/modeling_glm.py | 22 +++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) 
diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 2716d0068522..02041feaafd6 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -5,7 +5,7 @@ # modular_xxx.py file directly. One of our CI enforces this # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 -# Copyright 2024 TThe GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. # # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 46aa4729ba9f..91909da2b563 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -5,7 +5,7 @@ # modular_xxx.py file directly. One of our CI enforces this # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 -# Copyright 2024 TThe GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. +# Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved. # # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -164,9 +164,9 @@ def forward(self, x, position_ids, seq_len=None): def rotate_half(x): """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) + x1 = x[..., 0::2] + x2 = x[..., 1::2] + return torch.stack((-x2, x1), dim=-1).flatten(-2) def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): @@ -191,8 +191,22 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """ cos = cos.unsqueeze(unsqueeze_dim) sin = sin.unsqueeze(unsqueeze_dim) + + # Interleave them instead of usual shape + cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) + sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) + + # Keep half for later concatenation + q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :] + k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :] + + # Apply rotary embeddings on the first half q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) return q_embed, k_embed
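
The final hunk above (patch 39/39) switches GLM to an interleaved, partial rotary embedding: only the first half of each head dimension is rotated, adjacent channel pairs (0,1), (2,3), ... are rotated together, and the untouched half is concatenated back. Below is a minimal standalone sketch of that math; the functions mirror the patched code, while the toy shapes, the `rotary_dim` name, and the hand-built `cos`/`sin` table are illustrative assumptions rather than code taken from the model.

```python
import torch


def rotate_half(x):
    # Interleaved variant: pairs (x0, x1), (x2, x3), ... become (-x1, x0), (-x3, x2), ...
    x1 = x[..., 0::2]
    x2 = x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)


def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    cos = cos.unsqueeze(unsqueeze_dim)  # broadcast over the head axis
    sin = sin.unsqueeze(unsqueeze_dim)

    # cos/sin arrive with duplicated frequencies [f0..f_{r/2-1}, f0..f_{r/2-1}];
    # keep the unique half and interleave it to line up with rotate_half above
    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)

    # Partial rotary: only the first half of each head dim is rotated, the rest passes through
    q_rot, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :]
    k_rot, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :]

    q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
    k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1)
    return q_embed, k_embed


# Toy shapes (assumed for the sketch): rotary covers half of head_dim, as implied by the patch
batch, heads, seq, head_dim = 1, 2, 4, 8
rotary_dim = head_dim // 2

q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)

# Hand-built cos/sin table with duplicated frequencies, shape (batch, seq, rotary_dim)
inv_freq = 1.0 / (10000 ** (torch.arange(0, rotary_dim, 2).float() / rotary_dim))
freqs = torch.outer(torch.arange(seq).float(), inv_freq)  # (seq, rotary_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)                   # (seq, rotary_dim)
cos, sin = emb.cos()[None], emb.sin()[None]

q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
print(q_embed.shape, k_embed.shape)  # both keep the full (1, 2, 4, 8) shape
```

The interleaved pairing, rather than the Llama-style split into first and second halves of the head dimension, appears intended to match the channel layout of the original GLM checkpoints; that reading is an inference from the conversion script, not something the patch states explicitly.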
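
Patch 38/39 above changes the shard sort key in `merge_safetensors` from `x.split("-", 2)[1]` to `x.rsplit("-", 3)[1]`. A quick illustration of why, using a made-up path whose directory name also contains dashes (the path itself is an assumption for the sake of the example; `merge_safetensors` passes full paths built with `os.path.join`):

```python
# Hypothetical checkpoint path with dashes in the directory name
path = "glm-4-9b/model-00002-of-00010.safetensors"

# Old key: splitting from the left stops at the first dashes, which belong to the directory
print(path.split("-", 2))      # ['glm', '4', '9b/model-00002-of-00010.safetensors']
print(path.split("-", 2)[1])   # '4' -> every shard gets the same key, so the intended order is lost

# New key: splitting from the right only touches the shard-numbering dashes
print(path.rsplit("-", 3))          # ['glm-4-9b/model', '00002', 'of', '00010.safetensors']
print(int(path.rsplit("-", 3)[1]))  # 2 -> shards sort by their true index
```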