huggingface · fabianlim · Oct 11, 2024 · Oct 11, 2024
diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
@@ -75,6 +75,7 @@
     convert_model,
     convert_outputs_to_fp32,
     extract_model_from_parallel,
+    ensure_weights_retied,
     gather,
     gather_object,
     get_grad_scaler,
@@ -1472,6 +1473,13 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
                 if not is_type_fsdp:
                     self.state.fsdp_plugin.set_auto_wrap_policy(model)
                     fsdp_plugin = self.state.fsdp_plugin
+
+                    # need to ensure that params are re-tied after running
+                    # param_init_fn
+                    fsdp_plugin.param_init_fn = ensure_weights_retied(
+                        fsdp_plugin.param_init_fn, model, self.device, 
+                    )
+
                     kwargs = {
                         "sharding_strategy": fsdp_plugin.sharding_strategy,
                         "cpu_offload": fsdp_plugin.cpu_offload,

diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py
@@ -205,6 +205,7 @@
     merge_fsdp_weights,
     save_fsdp_model,
     save_fsdp_optimizer,
+    ensure_weights_retied,
 )
 from .launch import (
     PrepareForLaunch,

diff --git a/src/accelerate/utils/fsdp_utils.py b/src/accelerate/utils/fsdp_utils.py
@@ -14,9 +14,11 @@
 import os
 import shutil
 from pathlib import Path
+from collections import defaultdict
 
 import torch
 
+
 from ..logging import get_logger
 from .constants import FSDP_MODEL_NAME, OPTIMIZER_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_NAME
 from .modeling import is_peft_model
@@ -324,3 +326,75 @@ def merge_fsdp_weights(
             logger.info(f"Removing old checkpoint directory {checkpoint_dir}")
             shutil.rmtree(checkpoint_dir)
     state.wait_for_everyone()
+
+
+def ensure_weights_retied(param_init_fn, model: torch.nn.Module, device: torch.cuda.device):
+    """Wrap ``param_init_fn`` so tied parameters are tied again after it runs.
+
+    The wrapper is called with one module at a time and ``param_init_fn`` may
+    re-allocate that module's parameters. When that happens, modules that
+    used to share one parameter object end up holding separate copies. The
+    wrapper notes which direct parameters of a module are tied before calling
+    ``param_init_fn`` and afterwards points every later occurrence at the
+    instance produced for the first module that was initialised.
+
+    Args:
+        param_init_fn: Callable taking a ``torch.nn.Module``. It may return
+            the module or ``None`` (in-place initialisation). ``None`` itself
+            is passed through untouched.
+        model: Model whose optional ``_tied_weights_keys`` attribute lists
+            the dotted names of its tied parameters.
+        device: Currently unused; kept so existing callers keep working.
+
+    Returns:
+        ``param_init_fn`` unchanged when it is ``None`` or the model declares
+        no tied weights, otherwise a wrapper with the same call signature
+        that returns the initialised module.
+    """
+    # Plain ``torch.nn.Module`` instances do not define ``_tied_weights_keys``,
+    # so read it defensively instead of raising ``AttributeError``.
+    tied_names = getattr(model, "_tied_weights_keys", None)
+    if param_init_fn is None or not tied_names:
+        # nothing to wrap, or nothing to re-tie: just passthrough
+        return param_init_fn
+
+    # Map id(original parameter) -> re-initialised parameter. The value stays
+    # ``None`` until the first module owning that parameter has been
+    # initialised.
+    tied_params = {}
+    for full_name in tied_names:
+        # "a.b.weight" -> ("a.b", "weight"); a top-level name gives an empty
+        # module path, which ``get_submodule`` resolves to ``model`` itself.
+        module_path, _, param_name = full_name.rpartition(".")
+        owner = model.get_submodule(module_path)
+        tied_params[id(getattr(owner, param_name))] = None
+
+    def param_init_fn_tied_param(module: torch.nn.Module):
+        # Record the tied parameters *before* initialisation: afterwards the
+        # parameter objects may have been replaced and their ids changed.
+        params_to_tie = defaultdict(list)
+        for name, param in module.named_parameters(recurse=False):
+            if id(param) in tied_params:
+                params_to_tie[id(param)].append(name)
+
+        # call the param init fn, which potentially re-allocates the
+        # parameters
+        initialised = param_init_fn(module)
+        if initialised is not None:
+            # An init fn that works in place returns ``None``; only switch to
+            # the returned object when there is one.
+            module = initialised
+
+        # look the parameters up again and tie them back together
+        for original_id, names in params_to_tie.items():
+            for name in names:
+                shared = tied_params[original_id]
+                if shared is None:
+                    # everything will be tied to the instance seen first
+                    tied_params[original_id] = getattr(module, name)
+                else:
+                    setattr(module, name, shared)  # tie
+
+        return module
+
+    return param_init_fn_tied_param