Arde/fsdp activation checkpointing #25771

Merged
Changes from 2 commits
4 changes: 4 additions & 0 deletions docs/source/en/main_classes/trainer.md
@@ -456,6 +456,10 @@ as the model saving with FSDP activated is only available with recent fixes.
If `"True"`, FSDP explicitly prefetches the next upcoming all-gather while executing in the forward pass.
- `limit_all_gathers` can be specified in the config file.
If `"True"`, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight all-gathers.
- `activation_checkpointing` can be specified in the config file.
If `"True"`, FSDP activation checkpointing is enabled. This technique reduces memory usage by clearing the
activations of certain layers and recomputing them during the backward pass, effectively trading extra
computation time for reduced memory usage.
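Below is a minimal usage sketch, not part of this diff: it shows the new `activation_checkpointing` key alongside the other documented `fsdp_config` keys, assuming the config is passed inline as a Python dict rather than via a JSON file.

```python
# Hypothetical example: enable FSDP activation checkpointing via `fsdp_config`.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="out",
    fsdp="full_shard auto_wrap",
    fsdp_config={
        "forward_prefetch": True,
        "limit_all_gathers": True,
        # Clear activations of the wrapped layers and recompute them in the backward pass.
        "activation_checkpointing": True,
    },
)
```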

**A few caveats to be aware of**
- it is incompatible with `generate`, and thus incompatible with `--predict_with_generate`
7 changes: 7 additions & 0 deletions src/transformers/trainer.py
@@ -471,6 +471,10 @@ def __init__(
if self.args.fsdp_config.get("limit_all_gathers", False):
self.limit_all_gathers = True

self.activation_checkpointing = False
if self.args.fsdp_config.get("activation_checkpointing", False):
self.activation_checkpointing = True

# one place to sort out whether to place the model on device or not
# postpone switching model to cuda when:
# 1. MP - since we are trying to fit a much bigger than 1 gpu model
@@ -3896,6 +3900,9 @@ def create_accelerator_and_postprocess(self):
fsdp_plugin.limit_all_gathers = self.args.fsdp_config.get(
    "limit_all_gathers", fsdp_plugin.limit_all_gathers
)
fsdp_plugin.activation_checkpointing = self.args.fsdp_config.get(
    "activation_checkpointing", fsdp_plugin.activation_checkpointing
)

if self.is_deepspeed_enabled:
if getattr(self.args, "hf_deepspeed_config", None) is None:
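For context, here is a rough sketch of the mechanism that the `fsdp_plugin.activation_checkpointing` flag maps to; the helper name is made up, and the claim that Accelerate ultimately applies PyTorch's `checkpoint_wrapper` to the FSDP-wrapped layers is an assumption, not code from this PR.

```python
# Assumed mechanism: wrap target layers with torch's checkpoint_wrapper so their
# activations are dropped in the forward pass and recomputed during backward.
import functools

import torch.nn as nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    CheckpointImpl,
    apply_activation_checkpointing,
    checkpoint_wrapper,
)


def enable_activation_checkpointing(model: nn.Module, layer_cls: type) -> None:
    # Hypothetical helper, not part of this PR.
    wrapper = functools.partial(
        checkpoint_wrapper, checkpoint_impl=CheckpointImpl.NO_REENTRANT
    )
    apply_activation_checkpointing(
        model,
        checkpoint_wrapper_fn=wrapper,
        # Only checkpoint the transformer block class, e.g. the class given via
        # `fsdp_transformer_layer_cls_to_wrap`.
        check_fn=lambda module: isinstance(module, layer_cls),
    )
```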
4 changes: 4 additions & 0 deletions src/transformers/training_args.py
@@ -482,6 +482,10 @@ class TrainingArguments:
Will use gradient checkpointing over each nested XLA FSDP wrapped layer. This setting can only be
used when the xla flag is set to true, and an auto wrapping policy is specified through
fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap.
- activation_checkpointing (`bool`, *optional*, defaults to `False`):
If `True`, activation checkpointing is enabled. This technique reduces memory usage by clearing the
activations of certain layers and recomputing them during the backward pass, effectively trading extra
computation time for reduced memory usage.

deepspeed (`str` or `dict`, *optional*):
Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may