diff --git a/docs/source/usage_guides/quantization.md b/docs/source/usage_guides/quantization.md
index 872f738f8ac..7583b3f2adc 100644
--- a/docs/source/usage_guides/quantization.md
+++ b/docs/source/usage_guides/quantization.md
@@ -102,6 +102,27 @@ quantized_model_from_saved = load_and_quantize_model(empty_model, weights_locati
 
 Note that 4-bit model serialization is currently not supported.
 
+### Offload modules to cpu and disk
+
+You can offload some modules to the CPU or the disk if you don't have enough GPU memory to store the entire model on your GPUs.
+This uses big model inference under the hood. Check this [documentation](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) for more details.
+
+For 8-bit quantization, the selected modules will be converted to 8-bit precision.
+
+For 4-bit quantization, the selected modules will be kept in the `torch_dtype` that the user passed in `BnbQuantizationConfig`. We will add support for converting these offloaded modules to 4-bit once 4-bit serialization becomes possible.
+
+You just need to pass a custom `device_map` in order to offload modules to the CPU or the disk. The offloaded modules will be dispatched to the GPU when needed. Here's an example:
+
+```py
+device_map = {
+    "transformer.wte": 0,
+    "transformer.wpe": 0,
+    "transformer.drop": 0,
+    "transformer.h": "cpu",
+    "transformer.ln_f": "disk",
+    "lm_head": "disk",
+}
+```
 ### Fine-tune a quantized model
 
 With the official support of adapters in the Hugging Face ecosystem, you can fine-tune quantized models. Please have a look at [peft](https://github.com/huggingface/peft) library for more details.
diff --git a/src/accelerate/hooks.py b/src/accelerate/hooks.py
index bdb35a1632f..6aef7fb580d 100644
--- a/src/accelerate/hooks.py
+++ b/src/accelerate/hooks.py
@@ -279,7 +279,13 @@ def pre_forward(self, module, *args, **kwargs):
             for name, _ in named_module_tensors(
                 module, include_buffers=self.offload_buffers, recurse=self.place_submodules
             ):
-                set_module_tensor_to_device(module, name, self.execution_device, value=self.weights_map[name])
+                fp16_statistics = None
+                if "weight" in name and name.replace("weight", "SCB") in self.weights_map.keys():
+                    if self.weights_map[name].dtype == torch.int8:
+                        fp16_statistics = self.weights_map[name.replace("weight", "SCB")]
+                set_module_tensor_to_device(
+                    module, name, self.execution_device, value=self.weights_map[name], fp16_statistics=fp16_statistics
+                )
 
         return send_to_device(args, self.execution_device), send_to_device(
             kwargs, self.execution_device, skip_keys=self.skip_keys
@@ -291,6 +297,9 @@ def post_forward(self, module, output):
                 module, include_buffers=self.offload_buffers, recurse=self.place_submodules
             ):
                 set_module_tensor_to_device(module, name, "meta")
+                if type(module).__name__ == "Linear8bitLt":
+                    module.state.SCB = None
+                    module.state.CxB = None
 
         if self.io_same_device and self.input_device is not None:
             output = send_to_device(output, self.input_device, skip_keys=self.skip_keys)
diff --git a/src/accelerate/utils/bnb.py b/src/accelerate/utils/bnb.py
index 0ed4af56d40..073a553b11e 100644
--- a/src/accelerate/utils/bnb.py
+++ b/src/accelerate/utils/bnb.py
@@ -28,7 +28,14 @@
 
 from ..big_modeling import dispatch_model, init_empty_weights
 from .dataclasses import BnbQuantizationConfig
-from .modeling import find_tied_parameters, get_balanced_memory, infer_auto_device_map, load_checkpoint_in_model
+from .modeling import (
+    find_tied_parameters,
+    get_balanced_memory,
+    infer_auto_device_map,
+    load_checkpoint_in_model,
+    offload_weight,
+    
set_module_tensor_to_device,
+)
 
 
 if is_bnb_available():
@@ -98,24 +105,20 @@ def load_and_quantize_model(
         )
 
     modules_on_cpu = []
+    # custom device map
    if isinstance(device_map, dict) and len(device_map.keys()) > 1:
         modules_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
-        if len(modules_on_cpu) > 0 and not bnb_quantization_config.enable_fp32_cpu_offload:
-            raise ValueError(
-                "If you want to offload some keys to `cpu` or `disk`, you need to set "
-                " `enable_fp32_cpu_offload=True`. Note that these modules will not be "
-                " converted to 8-bit but kept in 32-bit."
-            )
 
     # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
     if bnb_quantization_config.skip_modules is None:
         bnb_quantization_config.skip_modules = get_keys_to_not_convert(model)
-        # add cpu modules to skip modules (after looking into the code on transformers, we don't really keep the cpu module in fp32)
-        bnb_quantization_config.skip_modules.extend(modules_on_cpu)
+    # add cpu modules to skip modules only for 4-bit quantization
+    if load_in_4bit:
+        bnb_quantization_config.skip_modules.extend(modules_on_cpu)
 
     modules_to_not_convert = bnb_quantization_config.skip_modules
 
-    # We add the modules we want to keep in full precision
+    # We add the modules we want to keep in full precision
     if bnb_quantization_config.keep_in_fp32_modules is None:
         bnb_quantization_config.keep_in_fp32_modules = []
     keep_in_fp32_modules = bnb_quantization_config.keep_in_fp32_modules
@@ -176,6 +179,8 @@ def load_and_quantize_model(
     if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
         offload_state_dict = True
 
+    offload = any(x in list(device_map.values()) for x in ["cpu", "disk"])
+
     load_checkpoint_in_model(
         model,
         weights_location,
@@ -184,6 +189,7 @@ def load_and_quantize_model(
         offload_folder=offload_folder,
         offload_state_dict=offload_state_dict,
         keep_in_fp32_modules=bnb_quantization_config.keep_in_fp32_modules,
+        offload_8bit_bnb=load_in_8bit and offload,
     )
     return dispatch_model(model, device_map=device_map, offload_dir=offload_folder)
 
@@ -247,18 +253,22 @@ def get_quantized_model_device_map(
         }
         for device in ["cpu", "disk"]:
             if device in device_map_without_some_modules.values():
-                raise ValueError(
-                    """
-                    Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the
-                    quantized model. If you want to dispatch the model on the CPU or the disk while keeping these
-                    modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
-                    `device_map` to `from_pretrained`. Check
-                    https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
-                    for more details.
-                    """
-                )
+                if bnb_quantization_config.load_in_4bit:
+                    raise ValueError(
+                        """
+                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
+                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
+                        these modules in `torch_dtype`, you need to pass a custom `device_map` to
+                        `load_and_quantize_model`. Check
+                        https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization#offload-modules-to-cpu-and-disk
+                        for more details.
+                        """
+                    )
+                else:
+                    logger.info(
+                        "Some modules are offloaded to the CPU or the disk. 
Note that these modules will be converted to 8-bit" + ) del device_map_without_some_modules - return device_map @@ -348,9 +358,10 @@ def _replace_with_bnb_layers( setattr(model, name, bnb_module) has_been_replaced = True if len(list(module.children())) > 0: - _, has_been_replaced = _replace_with_bnb_layers( + _, _has_been_replaced = _replace_with_bnb_layers( module, bnb_quantization_config, modules_to_not_convert, current_key_name ) + has_been_replaced = has_been_replaced | _has_been_replaced # Remove the last key for recursion current_key_name.pop(-1) return model, has_been_replaced @@ -418,3 +429,34 @@ def has_4bit_bnb_layers(model): def get_parameter_device(parameter: nn.Module): return next(parameter.parameters()).device + + +def quantize_and_offload_8bit(model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics): + # if it is not quantized, we quantize and offload the quantized weights and the SCB stats + if fp16_statistics is None: + set_module_tensor_to_device(model, param_name, 0, dtype=new_dtype, value=param) + tensor_name = param_name + module = model + if "." in tensor_name: + splits = tensor_name.split(".") + for split in splits[:-1]: + new_module = getattr(module, split) + if new_module is None: + raise ValueError(f"{module} has no attribute {split}.") + module = new_module + tensor_name = splits[-1] + # offload weights + module._parameters[tensor_name].requires_grad = False + offload_weight(module._parameters[tensor_name], param_name, offload_folder, index=offload_index) + if hasattr(module._parameters[tensor_name], "SCB"): + offload_weight( + module._parameters[tensor_name].SCB, + param_name.replace("weight", "SCB"), + offload_folder, + index=offload_index, + ) + else: + offload_weight(param, param_name, offload_folder, index=offload_index) + offload_weight(fp16_statistics, param_name.replace("weight", "SCB"), offload_folder, index=offload_index) + + set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype, value=torch.empty(*param.size())) diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py index 4da3ad0b56e..cb486c92b28 100644 --- a/src/accelerate/utils/dataclasses.py +++ b/src/accelerate/utils/dataclasses.py @@ -1378,17 +1378,6 @@ class BnbQuantizationConfig: metadata={"help": "an explicit list of the modules that we don't quantize. We keep them in `torch.float32`."}, ) - # we will see if it will be useful - enable_fp32_cpu_offload: bool = field( - default=False, - metadata={ - "help": """ this flag is used for advanced use cases and users that are aware of this feature. If you want to split - your model in different parts and run some parts in int8 on GPU and some parts in fp32 on CPU, you can use - this flag. This is useful for offloading large models such as `google/flan-t5-xxl`. Note that the int8 - operations will not be run on CPU.""" - }, - ) - def __post_init__(self): """ Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. 
@@ -1408,9 +1397,6 @@ def __post_init__(self):
         if not isinstance(self.llm_int8_threshold, (int, float)):
             raise ValueError("llm_int8_threshold must be a float or an int")
 
-        if not isinstance(self.enable_fp32_cpu_offload, bool):
-            raise ValueError("enable_fp32_cpu_offload must be a boolean")
-
         if not isinstance(self.bnb_4bit_quant_type, str):
             raise ValueError("bnb_4bit_quant_type must be a string")
         elif self.bnb_4bit_quant_type not in ["fp4", "nf4"]:
@@ -1439,9 +1425,6 @@ def __post_init__(self):
         if self.keep_in_fp32_modules is not None and not isinstance(self.keep_in_fp32_modules, list):
             raise ValueError("keep_in_fp_32_modules must be a list of strings")
 
-        if not isinstance(self.enable_fp32_cpu_offload, bool):
-            raise ValueError("enable_fp32_cpu_offload must be a boolean")
-
         if self.load_in_4bit:
             self.target_dtype = CustomDtype.INT4
 
diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py
index 175f2881173..c5095ed4286 100644
--- a/src/accelerate/utils/modeling.py
+++ b/src/accelerate/utils/modeling.py
@@ -279,7 +279,13 @@ def set_module_tensor_to_device(
     device_quantization = None
     with torch.no_grad():
         # leave it on cpu first before moving them to cuda
-        if param is not None and param.device.type != "cuda" and param_cls.__name__ in ["Int8Params", "FP4Params"]:
+        # fix the case where the device is meta: we don't want to put the tensor on cpu because there is no data
+        if (
+            param is not None
+            and param.device.type != "cuda"
+            and torch.device(device).type == "cuda"
+            and param_cls.__name__ in ["Int8Params", "FP4Params"]
+        ):
             device_quantization = device
             device = "cpu"
         if value is None:
@@ -303,15 +309,25 @@ def set_module_tensor_to_device(
         if param_cls.__name__ == "Int8Params" and new_value.dtype == torch.float32:
             # downcast to fp16 if any - needed for 8bit serialization
             new_value = new_value.to(torch.float16)
-            new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(device)
+            # quantize modules that are going to stay on the cpu so that we offload quantized weights
+            if device == "cpu" and param_cls.__name__ == "Int8Params":
+                new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(0).to("cpu")
+                new_value.CB = new_value.CB.to("cpu")
+                new_value.SCB = new_value.SCB.to("cpu")
+            else:
+                new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(device)
         else:
             new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
 
         module._parameters[tensor_name] = new_value
         if fp16_statistics is not None:
-            setattr(module.weight, "SCB", fp16_statistics.to(device))
-
-    if module.__class__.__name__ == "Linear8bitLt" and getattr(module.weight, "SCB", None) is None:
+            setattr(module._parameters[tensor_name], "SCB", fp16_statistics.to(device))
+            del fp16_statistics
+    # as we put the weight to meta, it doesn't have SCB attr anymore. 
make sure that it is not a meta weight
+    if (
+        module.__class__.__name__ == "Linear8bitLt"
+        and getattr(module.weight, "SCB", None) is None
+        and str(module.weight.device) != "meta"
+    ):
         # quantize only if necessary
         device_index = torch.device(device).index if torch.device(device).type == "cuda" else None
         if not getattr(module.weight, "SCB", None) and device_index is not None:
             module.weight = module.weight.cuda(device_index)
@@ -326,6 +342,8 @@
         device_index = torch.device(device).index if torch.device(device).type == "cuda" else None
         if not getattr(module.weight, "quant_state", None) and device_index is not None:
             module.weight = module.weight.cuda(device_index)
+    # clean pre and post forward hooks
+    torch.cuda.empty_cache()
 
 
 def named_module_tensors(module: nn.Module, include_buffers: bool = True, recurse: bool = False):
@@ -660,11 +678,18 @@ def load_offloaded_weights(model, index, offload_folder):
     if index is None or len(index) == 0:
         # Nothing to do
         return
-
     for param_name, metadata in index.items():
+        if "SCB" in param_name:
+            continue
+        fp16_statistics = None
+        if "weight" in param_name and param_name.replace("weight", "SCB") in index.keys():
+            weight_name = param_name.replace("weight", "SCB")
+            fp16_statistics = load_offloaded_weight(
+                os.path.join(offload_folder, f"{weight_name}.dat"), index[weight_name]
+            )
         tensor_file = os.path.join(offload_folder, f"{param_name}.dat")
         weight = load_offloaded_weight(tensor_file, metadata)
-        set_module_tensor_to_device(model, param_name, "cpu", value=weight)
+        set_module_tensor_to_device(model, param_name, "cpu", value=weight, fp16_statistics=fp16_statistics)
 
 
 def get_balanced_memory(
@@ -1137,6 +1162,7 @@ def load_checkpoint_in_model(
     offload_state_dict: bool = False,
     offload_buffers: bool = False,
     keep_in_fp32_modules: List[str] = None,
+    offload_8bit_bnb: bool = False,
 ):
     """
     Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
@@ -1171,8 +1197,13 @@ def load_checkpoint_in_model(
             Whether or not to include the buffers in the weights offloaded to disk.
         keep_in_fp32_modules(`List[str]`, *optional*):
             A list of the modules that we keep in `torch.float32` dtype.
+        offload_8bit_bnb (`bool`, *optional*):
+            Whether or not to enable offloading of 8-bit modules to cpu/disk. 
""" + if offload_8bit_bnb: + from .bnb import quantize_and_offload_8bit + tied_params = find_tied_parameters(model) if check_tied_parameters_in_config(model) and len(tied_params) == 0: @@ -1239,6 +1270,10 @@ def load_checkpoint_in_model( model.load_state_dict(checkpoint, strict=False) else: for param_name, param in checkpoint.items(): + # skip SCB parameter (for 8-bit serialization) + if "SCB" in param_name: + continue + module_name = param_name while len(module_name) > 0 and module_name not in device_map: @@ -1268,23 +1303,33 @@ def load_checkpoint_in_model( if offload_buffers or param_name not in buffer_names: if new_dtype is None: new_dtype = param.dtype - set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype) - offload_weight(param, param_name, offload_folder, index=offload_index) + if offload_8bit_bnb: + quantize_and_offload_8bit( + model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics + ) + continue + else: + set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype) + offload_weight(param, param_name, offload_folder, index=offload_index) elif param_device == "cpu" and offload_state_dict: if new_dtype is None: new_dtype = param.dtype - set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype) - offload_weight(param, param_name, state_dict_folder, index=state_dict_index) - else: - if "SCB" not in param_name: - set_module_tensor_to_device( - model, - param_name, - param_device, - value=param, - dtype=new_dtype, - fp16_statistics=fp16_statistics, + if offload_8bit_bnb: + quantize_and_offload_8bit( + model, param, param_name, new_dtype, state_dict_folder, state_dict_index, fp16_statistics ) + else: + set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype) + offload_weight(param, param_name, state_dict_folder, index=state_dict_index) + else: + set_module_tensor_to_device( + model, + param_name, + param_device, + value=param, + dtype=new_dtype, + fp16_statistics=fp16_statistics, + ) # Force Python to clean up. del checkpoint diff --git a/tests/test_quantization.py b/tests/test_quantization.py index 6b19a46cc88..83943baa84d 100644 --- a/tests/test_quantization.py +++ b/tests/test_quantization.py @@ -191,19 +191,20 @@ def test_fp32_8bit_conversion(self): self.assertTrue(model.lm_head.weight.dtype == torch.float32) @require_multi_gpu - def test_cpu_gpu_loading_random_device_map(self): + def test_cpu_gpu_loading_custom_device_map(self): + from bitsandbytes.nn import Int8Params from transformers import AutoConfig, AutoModelForCausalLM r""" - A test to check is dispatching a model on cpu & gpu works correctly using a random `device_map`. + A test to check is dispatching a model on cpu & gpu works correctly using a custom `device_map`. 
""" device_map = { "transformer.word_embeddings": "cpu", "transformer.word_embeddings_layernorm": 0, "lm_head": "cpu", - "transformer.h.0": 0, - "transformer.h.1": 0, - "transformer.h.2": 0, + "transformer.h.0": "cpu", + "transformer.h.1": "cpu", + "transformer.h.2": "cpu", "transformer.h.3": 0, "transformer.h.4": 0, "transformer.h.5": 0, @@ -227,8 +228,7 @@ def test_cpu_gpu_loading_random_device_map(self): "transformer.h.23": 0, "transformer.ln_f": 1, } - - bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True) with init_empty_weights(): model_8bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -241,24 +241,50 @@ def test_cpu_gpu_loading_random_device_map(self): device_map=device_map, no_split_module_classes=["BloomBlock"], ) + self.assertTrue(model_8bit.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(model_8bit.transformer.h[1].mlp.dense_4h_to_h.weight.__class__ == Int8Params) self.check_inference_correctness(model_8bit) @require_multi_gpu - def test_cpu_gpu_loading_custom_device_map(self): + def test_cpu_gpu_loading_custom_device_map_offload_state_dict(self): + from bitsandbytes.nn import Int8Params from transformers import AutoConfig, AutoModelForCausalLM r""" - A test to check is dispatching a model on cpu & gpu works correctly using a custom `device_map`. + A test to check is dispatching a model on cpu & gpu works correctly using a custom `device_map` and offload_state_dict=True. """ device_map = { "transformer.word_embeddings": "cpu", - "transformer.word_embeddings_layernorm": "cpu", + "transformer.word_embeddings_layernorm": 0, "lm_head": "cpu", - "transformer.h": 0, + "transformer.h.0": "cpu", + "transformer.h.1": "cpu", + "transformer.h.2": "cpu", + "transformer.h.3": 0, + "transformer.h.4": 0, + "transformer.h.5": 0, + "transformer.h.6": 0, + "transformer.h.7": 0, + "transformer.h.8": 0, + "transformer.h.9": 1, + "transformer.h.10": 0, + "transformer.h.11": 1, + "transformer.h.12": 0, + "transformer.h.13": 0, + "transformer.h.14": 1, + "transformer.h.15": 0, + "transformer.h.16": 0, + "transformer.h.17": 1, + "transformer.h.18": 1, + "transformer.h.19": 0, + "transformer.h.20": 1, + "transformer.h.21": 1, + "transformer.h.22": 0, + "transformer.h.23": 0, "transformer.ln_f": 1, } - bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True) with init_empty_weights(): model_8bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -270,11 +296,15 @@ def test_cpu_gpu_loading_custom_device_map(self): weights_location=self.weights_location, device_map=device_map, no_split_module_classes=["BloomBlock"], + offload_state_dict=True, ) + self.assertTrue(model_8bit.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(model_8bit.transformer.h[1].mlp.dense_4h_to_h.weight.__class__ == Int8Params) self.check_inference_correctness(model_8bit) @require_multi_gpu def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self): + from bitsandbytes.nn import Int8Params from transformers import AutoConfig, AutoModelForCausalLM r""" @@ -282,13 +312,36 @@ def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self): This time we also add `disk` on the device_map - using the kwargs directly instead of the quantization config """ device_map = { - 
"transformer.word_embeddings": 0, - "transformer.word_embeddings_layernorm": "cpu", - "lm_head": 0, - "transformer.h": 1, - "transformer.ln_f": "cpu", + "transformer.word_embeddings": "cpu", + "transformer.word_embeddings_layernorm": 0, + "lm_head": "cpu", + "transformer.h.0": "cpu", + "transformer.h.1": "cpu", + "transformer.h.2": "cpu", + "transformer.h.3": "disk", + "transformer.h.4": "disk", + "transformer.h.5": "disk", + "transformer.h.6": 0, + "transformer.h.7": 0, + "transformer.h.8": 0, + "transformer.h.9": 1, + "transformer.h.10": 0, + "transformer.h.11": 1, + "transformer.h.12": 0, + "transformer.h.13": 0, + "transformer.h.14": 1, + "transformer.h.15": 0, + "transformer.h.16": 0, + "transformer.h.17": 1, + "transformer.h.18": 1, + "transformer.h.19": 0, + "transformer.h.20": 1, + "transformer.h.21": 1, + "transformer.h.22": 0, + "transformer.h.23": 0, + "transformer.ln_f": 1, } - bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True) with init_empty_weights(): model_8bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -304,6 +357,8 @@ def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self): offload_folder=tmpdirname, offload_state_dict=True, ) + self.assertTrue(model_8bit.transformer.h[4].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(model_8bit.transformer.h[5].mlp.dense_4h_to_h.weight.__class__ == Int8Params) self.check_inference_correctness(model_8bit) def test_int8_serialization(self): @@ -338,6 +393,67 @@ def test_int8_serialization(self): self.check_inference_correctness(model_8bit_from_saved) + def test_int8_serialization_offload(self): + r""" + Test whether it is possible to serialize a model in 8-bit and offload weights to cpu/disk + """ + + from bitsandbytes.nn import Int8Params + from transformers import AutoConfig, AutoModelForCausalLM + + with tempfile.TemporaryDirectory() as tmpdirname: + # saving state dict for now but will save config and other in the future + self.accelerate.save_model(self.model_8bit, tmpdirname) + + with init_empty_weights(): + # let's suppose that we can get the right config + model_8bit_from_saved = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) + model_8bit_from_saved.tie_weights() + bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True) + device_map = { + "transformer.word_embeddings": "cpu", + "transformer.word_embeddings_layernorm": 0, + "lm_head": "cpu", + "transformer.h.0": "cpu", + "transformer.h.1": "cpu", + "transformer.h.2": "cpu", + "transformer.h.3": "disk", + "transformer.h.4": "disk", + "transformer.h.5": "disk", + "transformer.h.6": 0, + "transformer.h.7": 0, + "transformer.h.8": 0, + "transformer.h.9": 1, + "transformer.h.10": 0, + "transformer.h.11": 1, + "transformer.h.12": 0, + "transformer.h.13": 0, + "transformer.h.14": 1, + "transformer.h.15": 0, + "transformer.h.16": 0, + "transformer.h.17": 1, + "transformer.h.18": 1, + "transformer.h.19": 0, + "transformer.h.20": 1, + "transformer.h.21": 1, + "transformer.h.22": 0, + "transformer.h.23": 0, + "transformer.ln_f": 1, + } + model_8bit_from_saved = load_and_quantize_model( + model_8bit_from_saved, + bnb_quantization_config, + weights_location=tmpdirname + "/pytorch_model.bin", + device_map=device_map, + no_split_module_classes=["BloomBlock"], + offload_folder=tmpdirname + "/tmp", + offload_state_dict=True, + ) + + 
self.assertTrue(model_8bit_from_saved.transformer.h[4].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(model_8bit_from_saved.transformer.h[5].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.check_inference_correctness(model_8bit_from_saved) + def test_int8_serialization_shard(self): r""" Test whether it is possible to serialize a model in 8-bit. @@ -647,7 +763,7 @@ def test_cpu_gpu_loading_random_device_map(self): "transformer.ln_f": 1, } - bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True) with init_empty_weights(): model_4bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -677,7 +793,7 @@ def test_cpu_gpu_loading_custom_device_map(self): "transformer.ln_f": 1, } - bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True) with init_empty_weights(): model_4bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -707,7 +823,7 @@ def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self): "transformer.h": 1, "transformer.ln_f": "cpu", } - bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True) with init_empty_weights(): model_4bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name))