From 977338253f1490de3cf2f353b66b44b936a24150 Mon Sep 17 00:00:00 2001 From: JasonZhu1313 Date: Fri, 16 Aug 2024 16:59:47 -0700 Subject: [PATCH 01/21] add liger integration --- src/transformers/trainer.py | 11 ++++++++++- src/transformers/training_args.py | 12 ++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4bd2e1ef5c82b7..ebe0a192a1c81f 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -35,7 +35,7 @@ from collections.abc import Mapping from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union - +from liger_kernel.transformers import MODEL_TO_LIGER_KERNEL_PATCHING_FUNC # Integrations must be imported before ML frameworks: # isort: off @@ -463,6 +463,15 @@ def __init__( " to `True` to avoid any unexpected behavior such as device placement mismatching." ) + if self.args.use_liger: + if model.__class__.__name__ not in MODEL_TO_LIGER_KERNEL_PATCHING_FUNC: + raise ValueError( + "The model you have picked ({model.__class__.__name__}) cannot be used with Liger kernels, " + f"a list of supported model classes are: {MODEL_TO_LIGER_KERNEL_PATCHING_FUNC.keys()}" + ) + # monkey patch the model with liger kernels + MODEL_TO_LIGER_KERNEL_PATCHING_FUNC[model.__class__.__name__](model) + _is_quantized_and_base_model = getattr(model, "is_quantized", False) and not getattr( model, "_hf_peft_config_loaded", False ) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index ca6f32279fa422..c780576ab7265e 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -791,6 +791,11 @@ class TrainingArguments: eval_use_gather_object (`bool`, *optional*, defaults to `False`): Whether to run recursively gather object in a nested list/tuple/dictionary of objects from all devices. This should only be enabled if users are not just returning tensors, and this is actively discouraged by PyTorch. + + use_liger (`bool`, *optional*, defaults to `False`): + Whether enable [Liger](https://github.com/linkedin/Liger-Kernel) (Linkedin GPU Efficient Runtime) Kernel for LLM model training. + It can effectively increase multi-GPU training throughput by ~20% and reduces memory usage by ~60%, works out of the box with + flash attention, PyTorch FSDP, and Microsoft DeepSpeed. Currently, it supports llama, mistral, mixtral and gemma models. """ framework = "pt" @@ -1490,6 +1495,13 @@ class TrainingArguments: "help": "Whether to run through the entire `evaluation` step at the very beginning of training as a sanity check." }, ) + + use_liger: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether or not to enable the Liger (Linkedin GPU Efficient Runtime) Kernel for model training." 
+ }, + ) eval_use_gather_object: Optional[bool] = field( default=False, From e94e09a0649a6b4e42958b1b66fba96daed1d09b Mon Sep 17 00:00:00 2001 From: JasonZhu1313 Date: Fri, 16 Aug 2024 17:44:58 -0700 Subject: [PATCH 02/21] fix syntax --- src/transformers/trainer.py | 2 ++ src/transformers/training_args.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index ebe0a192a1c81f..b3ca282cd828dc 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -35,8 +35,10 @@ from collections.abc import Mapping from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union + from liger_kernel.transformers import MODEL_TO_LIGER_KERNEL_PATCHING_FUNC + # Integrations must be imported before ML frameworks: # isort: off from .integrations import ( diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index c780576ab7265e..cde23ae1e6672a 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -794,7 +794,7 @@ class TrainingArguments: use_liger (`bool`, *optional*, defaults to `False`): Whether enable [Liger](https://github.com/linkedin/Liger-Kernel) (Linkedin GPU Efficient Runtime) Kernel for LLM model training. - It can effectively increase multi-GPU training throughput by ~20% and reduces memory usage by ~60%, works out of the box with + It can effectively increase multi-GPU training throughput by ~20% and reduces memory usage by ~60%, works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. Currently, it supports llama, mistral, mixtral and gemma models. """ @@ -1495,7 +1495,7 @@ class TrainingArguments: "help": "Whether to run through the entire `evaluation` step at the very beginning of training as a sanity check." 
}, ) - + use_liger: Optional[bool] = field( default=False, metadata={ From 20c78c839a6f7e88cd97f53a68b92263963d03db Mon Sep 17 00:00:00 2001 From: JasonZhu1313 Date: Fri, 16 Aug 2024 17:55:31 -0700 Subject: [PATCH 03/21] fix import issue --- src/transformers/trainer.py | 22 ++++++++++++++-------- src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 5 +++++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index b3ca282cd828dc..41d536709f0df4 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -36,8 +36,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union -from liger_kernel.transformers import MODEL_TO_LIGER_KERNEL_PATCHING_FUNC - # Integrations must be imported before ML frameworks: # isort: off @@ -157,6 +155,7 @@ is_grokadamw_available, is_in_notebook, is_ipex_available, + is_liger_kernel_available, is_lomo_available, is_peft_available, is_safetensors_available, @@ -466,13 +465,20 @@ def __init__( ) if self.args.use_liger: - if model.__class__.__name__ not in MODEL_TO_LIGER_KERNEL_PATCHING_FUNC: - raise ValueError( - "The model you have picked ({model.__class__.__name__}) cannot be used with Liger kernels, " - f"a list of supported model classes are: {MODEL_TO_LIGER_KERNEL_PATCHING_FUNC.keys()}" + if is_liger_kernel_available(): + from liger_kernel.transformers import MODEL_TO_LIGER_KERNEL_PATCHING_FUNC + + if model.__class__.__name__ not in MODEL_TO_LIGER_KERNEL_PATCHING_FUNC: + raise ValueError( + "The model you have picked ({model.__class__.__name__}) cannot be used with Liger kernels, " + f"a list of supported model classes are: {MODEL_TO_LIGER_KERNEL_PATCHING_FUNC.keys()}" + ) + # monkey patch the model with liger kernels + MODEL_TO_LIGER_KERNEL_PATCHING_FUNC[model.__class__.__name__](model) + else: + raise ImportError( + "You have set `use_liger` to `True` but Liger kernel is not available. Please install Liger kernel" ) - # monkey patch the model with liger kernels - MODEL_TO_LIGER_KERNEL_PATCHING_FUNC[model.__class__.__name__](model) _is_quantized_and_base_model = getattr(model, "is_quantized", False) and not getattr( model, "_hf_peft_config_loaded", False diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 4df06118be3e97..54f1d6d85139fa 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -148,6 +148,7 @@ is_keras_nlp_available, is_levenshtein_available, is_librosa_available, + is_liger_kernel_avaiable, is_lomo_available, is_mlx_available, is_natten_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 6840921ddc43f7..81e3f662a0e5b5 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -177,6 +177,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _torchvision_available = _is_package_available("torchvision") _mlx_available = _is_package_available("mlx") _hqq_available = _is_package_available("hqq") +_liger_kernel_available = _is_package_available("liger_kernel") _torch_version = "N/A" @@ -1164,6 +1165,10 @@ def is_mlx_available(): return _mlx_available +def is_liger_kernel_avaiable(): + return _liger_kernel_available + + # docstyle-ignore AV_IMPORT_ERROR = """ {0} requires the PyAv library but it was not found in your environment. 
You can install it with: From f44157f31a8bda9a147fd653a9c11004da03518f Mon Sep 17 00:00:00 2001 From: JasonZhu1313 Date: Mon, 19 Aug 2024 08:56:47 -0700 Subject: [PATCH 04/21] add trainer.md --- docs/source/en/trainer.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 37d8baf3d7ec4c..97aa07b2c8521c 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -382,6 +382,41 @@ trainer.train() Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such issue. +## Liger Kernel + +[Liger](https://github.com/linkedin/Liger-Kernel) (Linkedin GPU Efficient Runtime) Kernel is a collection of Triton kernels designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. + + +Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) + + +First make sure to install Liger official repository: +```bash +pip install liger-kernel +``` + +You should pass `use_liger=True` to apply liger kenel on your model, for example: + +```py +from transformers import TrainingArguments + +training_args = TrainingArguments( + output_dir="your-model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, + use_liger=True +) +``` + +Currently, the kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. A list of supported models is defined [here](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/transformers/monkey_patch.py#L7). When use_liger is set to True, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. + ## LOMO optimizer The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195). 
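The documentation added in the patch above describes the mechanism as patching the corresponding layers of the original model with Liger's efficient implementations. The snippet below is a minimal, illustrative sketch of what that kind of module-level monkey patching looks like for a single layer type; it is not the integration code itself (the actual patching entry points live in the Liger repository and are wired into `Trainer` elsewhere in this series), and the `LigerRMSNorm` import path is the one exercised by the unit tests added later in the series. The tiny config values are arbitrary placeholders.

```py
# Illustrative sketch only -- not the integration code itself. Liger's patching
# helpers replace selected layer classes (and functions) at the modeling-module
# level, so models constructed afterwards pick up the Triton-based kernels.
from liger_kernel.transformers.rms_norm import LigerRMSNorm
from transformers import LlamaConfig, LlamaForCausalLM
from transformers.models.llama import modeling_llama

# Swap the RMSNorm class; RoPE, SwiGLU, and the cross-entropy loss are handled
# in the same spirit by the real patching functions in the Liger repository.
modeling_llama.LlamaRMSNorm = LigerRMSNorm

# A model built after the swap resolves `LlamaRMSNorm` to Liger's implementation.
config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2, num_attention_heads=4)
model = LlamaForCausalLM(config)
assert isinstance(model.model.layers[0].input_layernorm, LigerRMSNorm)
```
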
From f4e97475542a6e1408e488cdd0a6fd0c93cd0e06 Mon Sep 17 00:00:00 2001 From: shimizust Date: Mon, 19 Aug 2024 23:24:48 +0000 Subject: [PATCH 05/21] Use _apply_liger_kernel() --- src/transformers/trainer.py | 20 ++++++++++---------- src/transformers/utils/__init__.py | 2 +- src/transformers/utils/import_utils.py | 7 +++++-- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 41d536709f0df4..a611998f2a92f5 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -466,18 +466,18 @@ def __init__( if self.args.use_liger: if is_liger_kernel_available(): - from liger_kernel.transformers import MODEL_TO_LIGER_KERNEL_PATCHING_FUNC - - if model.__class__.__name__ not in MODEL_TO_LIGER_KERNEL_PATCHING_FUNC: - raise ValueError( - "The model you have picked ({model.__class__.__name__}) cannot be used with Liger kernels, " - f"a list of supported model classes are: {MODEL_TO_LIGER_KERNEL_PATCHING_FUNC.keys()}" - ) - # monkey patch the model with liger kernels - MODEL_TO_LIGER_KERNEL_PATCHING_FUNC[model.__class__.__name__](model) + from liger_kernel.transformers.trainer_integration import _apply_liger_kernel + + model_type = getattr(model, "config", None) and getattr(model.config, "model_type", None): + if model_type: + # Monkey patch the model with liger kernels. Use the default kernel configurations. + _apply_liger_kernel(model_type=model_type) + else: + logger.info("The model does not have a valid `model_type` specified.") else: raise ImportError( - "You have set `use_liger` to `True` but Liger kernel is not available. Please install Liger kernel" + "You have set `use_liger` to `True` but liger-kernel >= 0.1.0 is not available. " + "Please install it with `pip install liger-kernel`" ) _is_quantized_and_base_model = getattr(model, "is_quantized", False) and not getattr( diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 54f1d6d85139fa..56f594da15f122 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -148,7 +148,7 @@ is_keras_nlp_available, is_levenshtein_available, is_librosa_available, - is_liger_kernel_avaiable, + is_liger_kernel_available, is_lomo_available, is_mlx_available, is_natten_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 81e3f662a0e5b5..09e9e4e0fb679b 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -1165,8 +1165,11 @@ def is_mlx_available(): return _mlx_available -def is_liger_kernel_avaiable(): - return _liger_kernel_available +def is_liger_kernel_available(): + if not _liger_kernel_available: + return False + + return version.parse(importlib.metadata.version("liger_kernel")) >= version.parse("0.1.0") # docstyle-ignore From 38e2acd2704f13ed7d6a48fd84fe9f54ca1411f1 Mon Sep 17 00:00:00 2001 From: shimizust Date: Mon, 19 Aug 2024 23:42:33 +0000 Subject: [PATCH 06/21] Fixed log message --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index a611998f2a92f5..e4b7b59939c61e 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -473,7 +473,7 @@ def __init__( # Monkey patch the model with liger kernels. Use the default kernel configurations. 
_apply_liger_kernel(model_type=model_type) else: - logger.info("The model does not have a valid `model_type` specified.") + logger.info("The model does not have a valid `model_type` specified. No liger kernels will be applied.") else: raise ImportError( "You have set `use_liger` to `True` but liger-kernel >= 0.1.0 is not available. " From f27fdced9c77076e8dff270f69f2f4db05c8088a Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Tue, 20 Aug 2024 09:35:01 -0700 Subject: [PATCH 07/21] Update docs/source/en/trainer.md Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- docs/source/en/trainer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 97aa07b2c8521c..948ad5603437ea 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -415,7 +415,7 @@ training_args = TrainingArguments( ) ``` -Currently, the kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. A list of supported models is defined [here](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/transformers/monkey_patch.py#L7). When use_liger is set to True, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. +Currently, the kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. A list of supported models is defined [here](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/transformers/monkey_patch.py#L7). When `use_liger` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. ## LOMO optimizer From a74ca2482b2d896b99840ebf25cb941be1180f64 Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Tue, 20 Aug 2024 09:35:15 -0700 Subject: [PATCH 08/21] Update docs/source/en/trainer.md Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- docs/source/en/trainer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 948ad5603437ea..be996eaa9a1922 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -395,7 +395,7 @@ First make sure to install Liger official repository: pip install liger-kernel ``` -You should pass `use_liger=True` to apply liger kenel on your model, for example: +You should pass `use_liger=True` to apply liger kernel on your model, for example: ```py from transformers import TrainingArguments From d3d29f43966c59108565a102c55ab4086bb5c05c Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Tue, 20 Aug 2024 09:35:35 -0700 Subject: [PATCH 09/21] Update src/transformers/training_args.py Co-authored-by: Byron Hsu --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index cde23ae1e6672a..435eaa2da14183 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1499,7 +1499,7 @@ class TrainingArguments: use_liger: Optional[bool] = field( default=False, metadata={ - "help": "Whether or not to enable the Liger (Linkedin GPU Efficient Runtime) Kernel for model training." + "help": "Whether or not to enable the Liger Kernel for model training." 
}, ) From 29b13a9f2dee507b84afa96a587584e5cbd94c98 Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Tue, 20 Aug 2024 09:35:43 -0700 Subject: [PATCH 10/21] Update src/transformers/trainer.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index e4b7b59939c61e..b528e712a58ccb 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -473,7 +473,7 @@ def __init__( # Monkey patch the model with liger kernels. Use the default kernel configurations. _apply_liger_kernel(model_type=model_type) else: - logger.info("The model does not have a valid `model_type` specified. No liger kernels will be applied.") + logger.warning("The model does not have a valid `model_type` specified. No liger kernels will be applied.") else: raise ImportError( "You have set `use_liger` to `True` but liger-kernel >= 0.1.0 is not available. " From f0b2125d77a752f308f7f79dd6eff2ae9b098eb5 Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Tue, 20 Aug 2024 09:35:53 -0700 Subject: [PATCH 11/21] Update src/transformers/training_args.py Co-authored-by: Byron Hsu --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 435eaa2da14183..ca7a6d806bba23 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -793,7 +793,7 @@ class TrainingArguments: Whether to run recursively gather object in a nested list/tuple/dictionary of objects from all devices. This should only be enabled if users are not just returning tensors, and this is actively discouraged by PyTorch. use_liger (`bool`, *optional*, defaults to `False`): - Whether enable [Liger](https://github.com/linkedin/Liger-Kernel) (Linkedin GPU Efficient Runtime) Kernel for LLM model training. + Whether enable [Liger](https://github.com/linkedin/Liger-Kernel) Kernel for LLM model training. It can effectively increase multi-GPU training throughput by ~20% and reduces memory usage by ~60%, works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. Currently, it supports llama, mistral, mixtral and gemma models. """ From 86396296dd5130dbc0b920efd2e59dfad219d703 Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Tue, 20 Aug 2024 09:36:34 -0700 Subject: [PATCH 12/21] Update docs/source/en/trainer.md Co-authored-by: Byron Hsu --- docs/source/en/trainer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index be996eaa9a1922..45ee8d007d6391 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -384,7 +384,7 @@ Note layerwise optimization is a bit experimental and does not support DDP (Dist ## Liger Kernel -[Liger](https://github.com/linkedin/Liger-Kernel) (Linkedin GPU Efficient Runtime) Kernel is a collection of Triton kernels designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. 
+[Liger](https://github.com/linkedin/Liger-Kernel) Kernel is a collection of Triton kernels developed by Linkedin designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) From 2d7c4ab60826bd4409ce6cde7648987b3b3885c4 Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Tue, 20 Aug 2024 10:37:21 -0700 Subject: [PATCH 13/21] Fixed checkstyle and updated readme --- docs/source/en/trainer.md | 2 +- src/transformers/trainer.py | 8 +++++--- src/transformers/training_args.py | 4 +--- src/transformers/utils/import_utils.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 45ee8d007d6391..b13a0eef39bbfc 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -415,7 +415,7 @@ training_args = TrainingArguments( ) ``` -Currently, the kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. A list of supported models is defined [here](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/transformers/monkey_patch.py#L7). When `use_liger` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. +The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. ## LOMO optimizer diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index b528e712a58ccb..4dc8cc096ebaf3 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -467,13 +467,15 @@ def __init__( if self.args.use_liger: if is_liger_kernel_available(): from liger_kernel.transformers.trainer_integration import _apply_liger_kernel - - model_type = getattr(model, "config", None) and getattr(model.config, "model_type", None): + + model_type = getattr(model, "config", None) and getattr(model.config, "model_type", None) if model_type: # Monkey patch the model with liger kernels. Use the default kernel configurations. _apply_liger_kernel(model_type=model_type) else: - logger.warning("The model does not have a valid `model_type` specified. No liger kernels will be applied.") + logger.warning( + "The model does not have a valid `model_type` specified. No liger kernels will be applied." + ) else: raise ImportError( "You have set `use_liger` to `True` but liger-kernel >= 0.1.0 is not available. 
" diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index ca7a6d806bba23..b7c7fb3ab2ed7c 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1498,9 +1498,7 @@ class TrainingArguments: use_liger: Optional[bool] = field( default=False, - metadata={ - "help": "Whether or not to enable the Liger Kernel for model training." - }, + metadata={"help": "Whether or not to enable the Liger Kernel for model training."}, ) eval_use_gather_object: Optional[bool] = field( diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 09e9e4e0fb679b..03627b22c959e1 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -1168,7 +1168,7 @@ def is_mlx_available(): def is_liger_kernel_available(): if not _liger_kernel_available: return False - + return version.parse(importlib.metadata.version("liger_kernel")) >= version.parse("0.1.0") From e51eb93190d7f398e9abce8eab28922b08196f15 Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Tue, 20 Aug 2024 21:10:22 +0000 Subject: [PATCH 14/21] Added test --- src/transformers/testing_utils.py | 8 ++++++++ tests/trainer/test_trainer.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 8e234d7d1c6dc5..3d30c9ff647980 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -84,6 +84,7 @@ is_keras_nlp_available, is_levenshtein_available, is_librosa_available, + is_liger_kernel_available, is_lomo_available, is_natten_available, is_nltk_available, @@ -1162,6 +1163,13 @@ def require_librosa(test_case): return unittest.skipUnless(is_librosa_available(), "test requires librosa")(test_case) +def require_liger_kernel(test_case): + """ + Decorator marking a test that requires liger_kernel + """ + return unittest.skipUnless(is_liger_kernel_available(), "test requires liger_kernel")(test_case) + + def require_essentia(test_case): """ Decorator marking a test that requires essentia diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index ca133a277c41b5..cc0ab6d8013152 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -65,6 +65,7 @@ require_grokadamw, require_intel_extension_for_pytorch, require_lomo, + require_liger_kernel, require_optuna, require_peft, require_ray, @@ -1158,6 +1159,7 @@ def test_neftune(self): self.assertTrue(torch.allclose(emb1, emb2), "Neftune noise is still applied!") + def test_logging_inf_nan_filter(self): config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) tiny_gpt2 = GPT2LMHeadModel(config) @@ -1324,6 +1326,24 @@ def test_get_eval_dataloader_with_persistent_workers(self): self.assertEqual(first_dataloader, first_dataloader_repeated) self.assertEqual(second_dataloader, second_dataloader_repeated) + @require_liger_kernel + def test_apply_liger_kernel(self): + # Test that the model code actually gets patched with Liger kernel + from transformers.models.llama import modeling_llama + from liger_kernel.transformers.rms_norm import LigerRMSNorm + + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_model = LlamaForCausalLM(config) + + args = TrainingArguments( + "./test", + use_liger=True, + ) + Trainer(tiny_model, args) + + # Check that one of the Llama model layers has been correctly patched with Liger kernel + 
self.assertEqual(modeling_llama.LlamaRMSNorm, LigerRMSNorm) + @require_lomo @require_torch_gpu def test_lomo(self): From c286e1661ae32f0e1c16c28c4f987d5e824c0e5e Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Tue, 20 Aug 2024 21:32:20 +0000 Subject: [PATCH 15/21] Fixed checkstyle --- tests/trainer/test_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index cc0ab6d8013152..8d0cdc23817d32 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -64,8 +64,8 @@ require_galore_torch, require_grokadamw, require_intel_extension_for_pytorch, - require_lomo, require_liger_kernel, + require_lomo, require_optuna, require_peft, require_ray, @@ -1159,7 +1159,6 @@ def test_neftune(self): self.assertTrue(torch.allclose(emb1, emb2), "Neftune noise is still applied!") - def test_logging_inf_nan_filter(self): config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) tiny_gpt2 = GPT2LMHeadModel(config) @@ -1329,9 +1328,10 @@ def test_get_eval_dataloader_with_persistent_workers(self): @require_liger_kernel def test_apply_liger_kernel(self): # Test that the model code actually gets patched with Liger kernel - from transformers.models.llama import modeling_llama from liger_kernel.transformers.rms_norm import LigerRMSNorm + from transformers.models.llama import modeling_llama + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) tiny_model = LlamaForCausalLM(config) From fc05ba6c8a5a35738eae841963d1ad77a07f856e Mon Sep 17 00:00:00 2001 From: JasonZhu1313 Date: Tue, 20 Aug 2024 15:15:20 -0700 Subject: [PATCH 16/21] fix docstring --- docs/source/en/trainer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index b13a0eef39bbfc..0164291850d80a 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -384,7 +384,7 @@ Note layerwise optimization is a bit experimental and does not support DDP (Dist ## Liger Kernel -[Liger](https://github.com/linkedin/Liger-Kernel) Kernel is a collection of Triton kernels developed by Linkedin designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. +[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) Kernel is a collection of Triton kernels developed by Linkedin designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. 
See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) From b2bae31a82f2f0457e26a2700eb7c97f6b0316c9 Mon Sep 17 00:00:00 2001 From: JasonZhu1313 Date: Tue, 20 Aug 2024 15:19:11 -0700 Subject: [PATCH 17/21] rename use_liger to use_liger_kernel --- docs/source/en/trainer.md | 6 +++--- src/transformers/trainer.py | 4 ++-- src/transformers/training_args.py | 4 ++-- tests/trainer/test_trainer.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 0164291850d80a..b5a89c55c96af7 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -395,7 +395,7 @@ First make sure to install Liger official repository: pip install liger-kernel ``` -You should pass `use_liger=True` to apply liger kernel on your model, for example: +You should pass `use_liger_kernel=True` to apply liger kernel on your model, for example: ```py from transformers import TrainingArguments @@ -411,11 +411,11 @@ training_args = TrainingArguments( save_strategy="epoch", load_best_model_at_end=True, push_to_hub=True, - use_liger=True + use_liger_kernel=True ) ``` -The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. +The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. ## LOMO optimizer diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4dc8cc096ebaf3..5b7b416e1f8d78 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -464,7 +464,7 @@ def __init__( " to `True` to avoid any unexpected behavior such as device placement mismatching." ) - if self.args.use_liger: + if self.args.use_liger_kernel: if is_liger_kernel_available(): from liger_kernel.transformers.trainer_integration import _apply_liger_kernel @@ -478,7 +478,7 @@ def __init__( ) else: raise ImportError( - "You have set `use_liger` to `True` but liger-kernel >= 0.1.0 is not available. " + "You have set `use_liger_kernel` to `True` but liger-kernel >= 0.1.0 is not available. " "Please install it with `pip install liger-kernel`" ) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index b7c7fb3ab2ed7c..1bfcebf3afbe16 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -792,7 +792,7 @@ class TrainingArguments: eval_use_gather_object (`bool`, *optional*, defaults to `False`): Whether to run recursively gather object in a nested list/tuple/dictionary of objects from all devices. This should only be enabled if users are not just returning tensors, and this is actively discouraged by PyTorch. - use_liger (`bool`, *optional*, defaults to `False`): + use_liger_kernel (`bool`, *optional*, defaults to `False`): Whether enable [Liger](https://github.com/linkedin/Liger-Kernel) Kernel for LLM model training. 
It can effectively increase multi-GPU training throughput by ~20% and reduces memory usage by ~60%, works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. Currently, it supports llama, mistral, mixtral and gemma models. @@ -1496,7 +1496,7 @@ class TrainingArguments: }, ) - use_liger: Optional[bool] = field( + use_liger_kernel: Optional[bool] = field( default=False, metadata={"help": "Whether or not to enable the Liger Kernel for model training."}, ) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 8d0cdc23817d32..90cf7d61f49c60 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1337,7 +1337,7 @@ def test_apply_liger_kernel(self): args = TrainingArguments( "./test", - use_liger=True, + use_liger_kernel=True, ) Trainer(tiny_model, args) From d0b4be42d68f19584b49fd7136d16e8e6499bc0e Mon Sep 17 00:00:00 2001 From: JasonZhu1313 Date: Tue, 20 Aug 2024 16:23:36 -0700 Subject: [PATCH 18/21] Trigger Build From 59a900b2b787f0243ea235e4a65ba3707f97ed86 Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Wed, 21 Aug 2024 21:23:13 +0000 Subject: [PATCH 19/21] Added test --- tests/trainer/test_trainer.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 90cf7d61f49c60..6d7118568af895 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1326,24 +1326,41 @@ def test_get_eval_dataloader_with_persistent_workers(self): self.assertEqual(second_dataloader, second_dataloader_repeated) @require_liger_kernel - def test_apply_liger_kernel(self): + def test_use_liger_kernel_patching(self): # Test that the model code actually gets patched with Liger kernel from liger_kernel.transformers.rms_norm import LigerRMSNorm from transformers.models.llama import modeling_llama config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) - tiny_model = LlamaForCausalLM(config) + tiny_llama = LlamaForCausalLM(config) args = TrainingArguments( "./test", use_liger_kernel=True, ) - Trainer(tiny_model, args) + Trainer(tiny_llama, args) # Check that one of the Llama model layers has been correctly patched with Liger kernel self.assertEqual(modeling_llama.LlamaRMSNorm, LigerRMSNorm) + @require_liger_kernel + @require_torch_gpu + def test_use_liger_kernel_trainer(self): + # Check that trainer still works with liger kernel applied + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + with tempfile.TemporaryDirectory() as tmpdir: + args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + + # Check this works + _ = trainer.train() + @require_lomo @require_torch_gpu def test_lomo(self): From 62eff43f60eb3017d6d1ba59650d0e8719dc837d Mon Sep 17 00:00:00 2001 From: JasonZhu1313 Date: Thu, 22 Aug 2024 09:48:34 -0700 Subject: [PATCH 20/21] add fix-copies --- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 9f8e3cd19cd835..2477590a2f1fcc 100755 --- 
a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -717,9 +717,11 @@ def bigbird_block_sparse_attention( attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ :, :, :, :to_block_size ] # 1st key block (global) - attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( - second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] - ) # last three blocks (global + sliding) + attention_probs[ + :, :, -2 * from_block_size : -from_block_size, -3 * to_block_size : + ] = second_last_attn_weights[ + :, :, :, to_block_size : 4 * to_block_size + ] # last three blocks (global + sliding) # random keys for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch From eaf602b497558de5bf1d196f461a3dd52975905f Mon Sep 17 00:00:00 2001 From: Steven Shimizu Date: Thu, 22 Aug 2024 23:31:30 +0000 Subject: [PATCH 21/21] Fixed copy inconsistencies --- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 2477590a2f1fcc..9f8e3cd19cd835 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -717,11 +717,9 @@ def bigbird_block_sparse_attention( attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ :, :, :, :to_block_size ] # 1st key block (global) - attention_probs[ - :, :, -2 * from_block_size : -from_block_size, -3 * to_block_size : - ] = second_last_attn_weights[ - :, :, :, to_block_size : 4 * to_block_size - ] # last three blocks (global + sliding) + attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( + second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] + ) # last three blocks (global + sliding) # random keys for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch
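
Taken together, the series leaves a very small user-facing surface: install `liger-kernel`, pass `use_liger_kernel=True` (the final flag name after the rename), and train as usual. The sketch below is illustrative rather than part of the patches: it mirrors the two unit tests added in the series by building a tiny Llama model, checking that the modeling module was patched during `Trainer.__init__`, and running a few optimizer steps on random token data. It assumes `liger-kernel >= 0.1.0` is installed and, for the training steps, a CUDA GPU as in the GPU-marked test; the tiny config values and the random dataset are placeholders chosen for the sketch.

```py
# Illustrative end-to-end sketch (not part of the patch series); mirrors the
# unit tests added above. Requires `pip install liger-kernel` and a CUDA GPU
# for the actual training steps.
import torch
from liger_kernel.transformers.rms_norm import LigerRMSNorm
from transformers import LlamaConfig, LlamaForCausalLM, Trainer, TrainingArguments
from transformers.models.llama import modeling_llama


class RandomTokenDataset(torch.utils.data.Dataset):
    """Random token ids with labels equal to inputs -- enough for a smoke test."""

    def __len__(self):
        return 64

    def __getitem__(self, idx):
        ids = torch.randint(0, 100, (128,))
        return {"input_ids": ids, "labels": ids}


config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
tiny_llama = LlamaForCausalLM(config)

args = TrainingArguments(
    output_dir="liger-smoke-test",
    learning_rate=1e-2,
    logging_steps=5,
    max_steps=20,
    use_liger_kernel=True,  # final flag name; raises ImportError if liger-kernel is missing
)
trainer = Trainer(model=tiny_llama, args=args, train_dataset=RandomTokenDataset())

# Trainer.__init__ applies the Liger patch for the model's `model_type`, so the
# Llama modeling module now exposes Liger's RMSNorm implementation.
assert modeling_llama.LlamaRMSNorm is LigerRMSNorm

trainer.train()
```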