diff --git a/docs/source/usage_guides/quantization.md b/docs/source/usage_guides/quantization.md
index 872f738f8ac..7583b3f2adc 100644
--- a/docs/source/usage_guides/quantization.md
+++ b/docs/source/usage_guides/quantization.md
@@ -102,6 +102,27 @@ quantized_model_from_saved = load_and_quantize_model(empty_model, weights_locati
 
 Note that 4-bit model serialization is currently not supported.
 
+### Offload modules to cpu and disk
+
+You can offload some modules to the CPU or the disk if you don't have enough GPU memory to store the entire model on your GPUs.
+This uses big model inference under the hood. Check this [documentation](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) for more details.
+
+For 8-bit quantization, the selected modules will be converted to 8-bit precision.
+
+For 4-bit quantization, the selected modules will be kept in the `torch_dtype` that the user passed in `BnbQuantizationConfig`. We will add support for converting these offloaded modules to 4-bit once 4-bit serialization becomes possible.
+
+You just need to pass a custom `device_map` in order to offload modules to the CPU or the disk. The offloaded modules will be dispatched to the GPU when needed. Here's an example:
+
+```py
+device_map = {
+    "transformer.wte": 0,
+    "transformer.wpe": 0,
+    "transformer.drop": 0,
+    "transformer.h": "cpu",
+    "transformer.ln_f": "disk",
+    "lm_head": "disk",
+}
+```
 ### Fine-tune a quantized model
 
 With the official support of adapters in the Hugging Face ecosystem, you can fine-tune quantized models. Please have a look at [peft](https://github.com/huggingface/peft) library for more details.
diff --git a/src/accelerate/hooks.py b/src/accelerate/hooks.py
index bdb35a1632f..6aef7fb580d 100644
--- a/src/accelerate/hooks.py
+++ b/src/accelerate/hooks.py
@@ -279,7 +279,13 @@ def pre_forward(self, module, *args, **kwargs):
             for name, _ in named_module_tensors(
                 module, include_buffers=self.offload_buffers, recurse=self.place_submodules
             ):
-                set_module_tensor_to_device(module, name, self.execution_device, value=self.weights_map[name])
+                fp16_statistics = None
+                if "weight" in name and name.replace("weight", "SCB") in self.weights_map.keys():
+                    if self.weights_map[name].dtype == torch.int8:
+                        fp16_statistics = self.weights_map[name.replace("weight", "SCB")]
+                set_module_tensor_to_device(
+                    module, name, self.execution_device, value=self.weights_map[name], fp16_statistics=fp16_statistics
+                )
 
         return send_to_device(args, self.execution_device), send_to_device(
             kwargs, self.execution_device, skip_keys=self.skip_keys
@@ -291,6 +297,9 @@ def post_forward(self, module, output):
                 module, include_buffers=self.offload_buffers, recurse=self.place_submodules
             ):
                 set_module_tensor_to_device(module, name, "meta")
+                if type(module).__name__ == "Linear8bitLt":
+                    module.state.SCB = None
+                    module.state.CxB = None
 
         if self.io_same_device and self.input_device is not None:
             output = send_to_device(output, self.input_device, skip_keys=self.skip_keys)
diff --git a/src/accelerate/utils/bnb.py b/src/accelerate/utils/bnb.py
index 0ed4af56d40..073a553b11e 100644
--- a/src/accelerate/utils/bnb.py
+++ b/src/accelerate/utils/bnb.py
@@ -28,7 +28,14 @@
 
 from ..big_modeling import dispatch_model, init_empty_weights
 from .dataclasses import BnbQuantizationConfig
-from .modeling import find_tied_parameters, get_balanced_memory, infer_auto_device_map, load_checkpoint_in_model
+from .modeling import (
+    find_tied_parameters,
+    get_balanced_memory,
+    infer_auto_device_map,
+    load_checkpoint_in_model,
+    offload_weight,
+    
set_module_tensor_to_device,
+)
 
 
 if is_bnb_available():
@@ -98,24 +105,20 @@ def load_and_quantize_model(
         )
 
     modules_on_cpu = []
+    # custom device map
    if isinstance(device_map, dict) and len(device_map.keys()) > 1:
         modules_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
-        if len(modules_on_cpu) > 0 and not bnb_quantization_config.enable_fp32_cpu_offload:
-            raise ValueError(
-                "If you want to offload some keys to `cpu` or `disk`, you need to set "
-                " `enable_fp32_cpu_offload=True`. Note that these modules will not be "
-                " converted to 8-bit but kept in 32-bit."
-            )
 
     # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
     if bnb_quantization_config.skip_modules is None:
         bnb_quantization_config.skip_modules = get_keys_to_not_convert(model)
-        # add cpu modules to skip modules (after looking into the code on transformers, we don't really keep the cpu module in fp32)
-        bnb_quantization_config.skip_modules.extend(modules_on_cpu)
+    # add cpu modules to skip modules only for 4-bit quantization
+    if load_in_4bit:
+        bnb_quantization_config.skip_modules.extend(modules_on_cpu)
 
     modules_to_not_convert = bnb_quantization_config.skip_modules
 
-    # We add the modules we want to keep in full precision
+    # We add the modules we want to keep in full precision
     if bnb_quantization_config.keep_in_fp32_modules is None:
         bnb_quantization_config.keep_in_fp32_modules = []
     keep_in_fp32_modules = bnb_quantization_config.keep_in_fp32_modules
@@ -176,6 +179,8 @@ def load_and_quantize_model(
     if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
         offload_state_dict = True
 
+    offload = any(x in list(device_map.values()) for x in ["cpu", "disk"])
+
     load_checkpoint_in_model(
         model,
         weights_location,
@@ -184,6 +189,7 @@ def load_and_quantize_model(
         offload_folder=offload_folder,
         offload_state_dict=offload_state_dict,
         keep_in_fp32_modules=bnb_quantization_config.keep_in_fp32_modules,
+        offload_8bit_bnb=load_in_8bit and offload,
     )
     return dispatch_model(model, device_map=device_map, offload_dir=offload_folder)
 
@@ -247,18 +253,22 @@ def get_quantized_model_device_map(
         }
         for device in ["cpu", "disk"]:
             if device in device_map_without_some_modules.values():
-                raise ValueError(
-                    """
-                    Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the
-                    quantized model. If you want to dispatch the model on the CPU or the disk while keeping these
-                    modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
-                    `device_map` to `from_pretrained`. Check
-                    https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
-                    for more details.
-                    """
-                )
+                if bnb_quantization_config.load_in_4bit:
+                    raise ValueError(
+                        """
+                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
+                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
+                        these modules in `torch_dtype`, you need to pass a custom `device_map` to
+                        `load_and_quantize_model`. Check
+                        https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization#offload-modules-to-cpu-and-disk
+                        for more details.
+                        """
+                    )
+                else:
+                    logger.info(
+                        "Some modules are offloaded to the CPU or the disk. 
Note that these modules will be converted to 8-bit" + ) del device_map_without_some_modules - return device_map @@ -348,9 +358,10 @@ def _replace_with_bnb_layers( setattr(model, name, bnb_module) has_been_replaced = True if len(list(module.children())) > 0: - _, has_been_replaced = _replace_with_bnb_layers( + _, _has_been_replaced = _replace_with_bnb_layers( module, bnb_quantization_config, modules_to_not_convert, current_key_name ) + has_been_replaced = has_been_replaced | _has_been_replaced # Remove the last key for recursion current_key_name.pop(-1) return model, has_been_replaced @@ -418,3 +429,34 @@ def has_4bit_bnb_layers(model): def get_parameter_device(parameter: nn.Module): return next(parameter.parameters()).device + + +def quantize_and_offload_8bit(model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics): + # if it is not quantized, we quantize and offload the quantized weights and the SCB stats + if fp16_statistics is None: + set_module_tensor_to_device(model, param_name, 0, dtype=new_dtype, value=param) + tensor_name = param_name + module = model + if "." in tensor_name: + splits = tensor_name.split(".") + for split in splits[:-1]: + new_module = getattr(module, split) + if new_module is None: + raise ValueError(f"{module} has no attribute {split}.") + module = new_module + tensor_name = splits[-1] + # offload weights + module._parameters[tensor_name].requires_grad = False + offload_weight(module._parameters[tensor_name], param_name, offload_folder, index=offload_index) + if hasattr(module._parameters[tensor_name], "SCB"): + offload_weight( + module._parameters[tensor_name].SCB, + param_name.replace("weight", "SCB"), + offload_folder, + index=offload_index, + ) + else: + offload_weight(param, param_name, offload_folder, index=offload_index) + offload_weight(fp16_statistics, param_name.replace("weight", "SCB"), offload_folder, index=offload_index) + + set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype, value=torch.empty(*param.size())) diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py index 4da3ad0b56e..cb486c92b28 100644 --- a/src/accelerate/utils/dataclasses.py +++ b/src/accelerate/utils/dataclasses.py @@ -1378,17 +1378,6 @@ class BnbQuantizationConfig: metadata={"help": "an explicit list of the modules that we don't quantize. We keep them in `torch.float32`."}, ) - # we will see if it will be useful - enable_fp32_cpu_offload: bool = field( - default=False, - metadata={ - "help": """ this flag is used for advanced use cases and users that are aware of this feature. If you want to split - your model in different parts and run some parts in int8 on GPU and some parts in fp32 on CPU, you can use - this flag. This is useful for offloading large models such as `google/flan-t5-xxl`. Note that the int8 - operations will not be run on CPU.""" - }, - ) - def __post_init__(self): """ Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. 
@@ -1408,9 +1397,6 @@ def __post_init__(self):
         if not isinstance(self.llm_int8_threshold, (int, float)):
             raise ValueError("llm_int8_threshold must be a float or an int")
 
-        if not isinstance(self.enable_fp32_cpu_offload, bool):
-            raise ValueError("enable_fp32_cpu_offload must be a boolean")
-
         if not isinstance(self.bnb_4bit_quant_type, str):
             raise ValueError("bnb_4bit_quant_type must be a string")
         elif self.bnb_4bit_quant_type not in ["fp4", "nf4"]:
@@ -1439,9 +1425,6 @@ def __post_init__(self):
         if self.keep_in_fp32_modules is not None and not isinstance(self.keep_in_fp32_modules, list):
             raise ValueError("keep_in_fp_32_modules must be a list of strings")
 
-        if not isinstance(self.enable_fp32_cpu_offload, bool):
-            raise ValueError("enable_fp32_cpu_offload must be a boolean")
-
         if self.load_in_4bit:
             self.target_dtype = CustomDtype.INT4
 
diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py
index 175f2881173..c5095ed4286 100644
--- a/src/accelerate/utils/modeling.py
+++ b/src/accelerate/utils/modeling.py
@@ -279,7 +279,13 @@ def set_module_tensor_to_device(
     device_quantization = None
     with torch.no_grad():
         # leave it on cpu first before moving them to cuda
-        if param is not None and param.device.type != "cuda" and param_cls.__name__ in ["Int8Params", "FP4Params"]:
+        # fix the case where the device is meta: we don't want to put the tensor on cpu because there is no data
+        if (
+            param is not None
+            and param.device.type != "cuda"
+            and torch.device(device).type == "cuda"
+            and param_cls.__name__ in ["Int8Params", "FP4Params"]
+        ):
             device_quantization = device
             device = "cpu"
         if value is None:
@@ -303,15 +309,25 @@ def set_module_tensor_to_device(
         if param_cls.__name__ == "Int8Params" and new_value.dtype == torch.float32:
             # downcast to fp16 if any - needed for 8bit serialization
             new_value = new_value.to(torch.float16)
-            new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(device)
+            # quantize modules that are going to stay on the cpu so that we offload quantized weights
+            if device == "cpu" and param_cls.__name__ == "Int8Params":
+                new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(0).to("cpu")
+                new_value.CB = new_value.CB.to("cpu")
+                new_value.SCB = new_value.SCB.to("cpu")
+            else:
+                new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(device)
         else:
             new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
 
         module._parameters[tensor_name] = new_value
         if fp16_statistics is not None:
-            setattr(module.weight, "SCB", fp16_statistics.to(device))
-
-    if module.__class__.__name__ == "Linear8bitLt" and getattr(module.weight, "SCB", None) is None:
+            setattr(module._parameters[tensor_name], "SCB", fp16_statistics.to(device))
+            del fp16_statistics
+    # as we put the weight to meta, it doesn't have SCB attr anymore. 
make sure that it is not a meta weight
+    if (
+        module.__class__.__name__ == "Linear8bitLt"
+        and getattr(module.weight, "SCB", None) is None
+        and str(module.weight.device) != "meta"
+    ):
         # quantize only if necessary
         device_index = torch.device(device).index if torch.device(device).type == "cuda" else None
         if not getattr(module.weight, "SCB", None) and device_index is not None:
             module.weight = module.weight.cuda(device_index)
@@ -326,6 +342,8 @@
         device_index = torch.device(device).index if torch.device(device).type == "cuda" else None
         if not getattr(module.weight, "quant_state", None) and device_index is not None:
             module.weight = module.weight.cuda(device_index)
+    # clean pre and post forward hooks
+    torch.cuda.empty_cache()
 
 
 def named_module_tensors(module: nn.Module, include_buffers: bool = True, recurse: bool = False):
@@ -660,11 +678,18 @@ def load_offloaded_weights(model, index, offload_folder):
     if index is None or len(index) == 0:
         # Nothing to do
         return
-
     for param_name, metadata in index.items():
+        if "SCB" in param_name:
+            continue
+        fp16_statistics = None
+        if "weight" in param_name and param_name.replace("weight", "SCB") in index.keys():
+            weight_name = param_name.replace("weight", "SCB")
+            fp16_statistics = load_offloaded_weight(
+                os.path.join(offload_folder, f"{weight_name}.dat"), index[weight_name]
+            )
         tensor_file = os.path.join(offload_folder, f"{param_name}.dat")
         weight = load_offloaded_weight(tensor_file, metadata)
-        set_module_tensor_to_device(model, param_name, "cpu", value=weight)
+        set_module_tensor_to_device(model, param_name, "cpu", value=weight, fp16_statistics=fp16_statistics)
 
 
 def get_balanced_memory(
@@ -1137,6 +1162,7 @@ def load_checkpoint_in_model(
     offload_state_dict: bool = False,
     offload_buffers: bool = False,
     keep_in_fp32_modules: List[str] = None,
+    offload_8bit_bnb: bool = False,
 ):
     """
     Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
@@ -1171,8 +1197,13 @@ def load_checkpoint_in_model(
             Whether or not to include the buffers in the weights offloaded to disk.
         keep_in_fp32_modules(`List[str]`, *optional*):
             A list of the modules that we keep in `torch.float32` dtype.
+        offload_8bit_bnb (`bool`, *optional*):
+            Whether or not to enable offloading of 8-bit modules to cpu/disk. 
""" + if offload_8bit_bnb: + from .bnb import quantize_and_offload_8bit + tied_params = find_tied_parameters(model) if check_tied_parameters_in_config(model) and len(tied_params) == 0: @@ -1239,6 +1270,10 @@ def load_checkpoint_in_model( model.load_state_dict(checkpoint, strict=False) else: for param_name, param in checkpoint.items(): + # skip SCB parameter (for 8-bit serialization) + if "SCB" in param_name: + continue + module_name = param_name while len(module_name) > 0 and module_name not in device_map: @@ -1268,23 +1303,33 @@ def load_checkpoint_in_model( if offload_buffers or param_name not in buffer_names: if new_dtype is None: new_dtype = param.dtype - set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype) - offload_weight(param, param_name, offload_folder, index=offload_index) + if offload_8bit_bnb: + quantize_and_offload_8bit( + model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics + ) + continue + else: + set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype) + offload_weight(param, param_name, offload_folder, index=offload_index) elif param_device == "cpu" and offload_state_dict: if new_dtype is None: new_dtype = param.dtype - set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype) - offload_weight(param, param_name, state_dict_folder, index=state_dict_index) - else: - if "SCB" not in param_name: - set_module_tensor_to_device( - model, - param_name, - param_device, - value=param, - dtype=new_dtype, - fp16_statistics=fp16_statistics, + if offload_8bit_bnb: + quantize_and_offload_8bit( + model, param, param_name, new_dtype, state_dict_folder, state_dict_index, fp16_statistics ) + else: + set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype) + offload_weight(param, param_name, state_dict_folder, index=state_dict_index) + else: + set_module_tensor_to_device( + model, + param_name, + param_device, + value=param, + dtype=new_dtype, + fp16_statistics=fp16_statistics, + ) # Force Python to clean up. del checkpoint diff --git a/tests/test_quantization.py b/tests/test_quantization.py index 6b19a46cc88..83943baa84d 100644 --- a/tests/test_quantization.py +++ b/tests/test_quantization.py @@ -191,19 +191,20 @@ def test_fp32_8bit_conversion(self): self.assertTrue(model.lm_head.weight.dtype == torch.float32) @require_multi_gpu - def test_cpu_gpu_loading_random_device_map(self): + def test_cpu_gpu_loading_custom_device_map(self): + from bitsandbytes.nn import Int8Params from transformers import AutoConfig, AutoModelForCausalLM r""" - A test to check is dispatching a model on cpu & gpu works correctly using a random `device_map`. + A test to check is dispatching a model on cpu & gpu works correctly using a custom `device_map`. 
""" device_map = { "transformer.word_embeddings": "cpu", "transformer.word_embeddings_layernorm": 0, "lm_head": "cpu", - "transformer.h.0": 0, - "transformer.h.1": 0, - "transformer.h.2": 0, + "transformer.h.0": "cpu", + "transformer.h.1": "cpu", + "transformer.h.2": "cpu", "transformer.h.3": 0, "transformer.h.4": 0, "transformer.h.5": 0, @@ -227,8 +228,7 @@ def test_cpu_gpu_loading_random_device_map(self): "transformer.h.23": 0, "transformer.ln_f": 1, } - - bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True) with init_empty_weights(): model_8bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -241,24 +241,50 @@ def test_cpu_gpu_loading_random_device_map(self): device_map=device_map, no_split_module_classes=["BloomBlock"], ) + self.assertTrue(model_8bit.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(model_8bit.transformer.h[1].mlp.dense_4h_to_h.weight.__class__ == Int8Params) self.check_inference_correctness(model_8bit) @require_multi_gpu - def test_cpu_gpu_loading_custom_device_map(self): + def test_cpu_gpu_loading_custom_device_map_offload_state_dict(self): + from bitsandbytes.nn import Int8Params from transformers import AutoConfig, AutoModelForCausalLM r""" - A test to check is dispatching a model on cpu & gpu works correctly using a custom `device_map`. + A test to check is dispatching a model on cpu & gpu works correctly using a custom `device_map` and offload_state_dict=True. """ device_map = { "transformer.word_embeddings": "cpu", - "transformer.word_embeddings_layernorm": "cpu", + "transformer.word_embeddings_layernorm": 0, "lm_head": "cpu", - "transformer.h": 0, + "transformer.h.0": "cpu", + "transformer.h.1": "cpu", + "transformer.h.2": "cpu", + "transformer.h.3": 0, + "transformer.h.4": 0, + "transformer.h.5": 0, + "transformer.h.6": 0, + "transformer.h.7": 0, + "transformer.h.8": 0, + "transformer.h.9": 1, + "transformer.h.10": 0, + "transformer.h.11": 1, + "transformer.h.12": 0, + "transformer.h.13": 0, + "transformer.h.14": 1, + "transformer.h.15": 0, + "transformer.h.16": 0, + "transformer.h.17": 1, + "transformer.h.18": 1, + "transformer.h.19": 0, + "transformer.h.20": 1, + "transformer.h.21": 1, + "transformer.h.22": 0, + "transformer.h.23": 0, "transformer.ln_f": 1, } - bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True) with init_empty_weights(): model_8bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -270,11 +296,15 @@ def test_cpu_gpu_loading_custom_device_map(self): weights_location=self.weights_location, device_map=device_map, no_split_module_classes=["BloomBlock"], + offload_state_dict=True, ) + self.assertTrue(model_8bit.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(model_8bit.transformer.h[1].mlp.dense_4h_to_h.weight.__class__ == Int8Params) self.check_inference_correctness(model_8bit) @require_multi_gpu def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self): + from bitsandbytes.nn import Int8Params from transformers import AutoConfig, AutoModelForCausalLM r""" @@ -282,13 +312,36 @@ def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self): This time we also add `disk` on the device_map - using the kwargs directly instead of the quantization config """ device_map = { - 
"transformer.word_embeddings": 0, - "transformer.word_embeddings_layernorm": "cpu", - "lm_head": 0, - "transformer.h": 1, - "transformer.ln_f": "cpu", + "transformer.word_embeddings": "cpu", + "transformer.word_embeddings_layernorm": 0, + "lm_head": "cpu", + "transformer.h.0": "cpu", + "transformer.h.1": "cpu", + "transformer.h.2": "cpu", + "transformer.h.3": "disk", + "transformer.h.4": "disk", + "transformer.h.5": "disk", + "transformer.h.6": 0, + "transformer.h.7": 0, + "transformer.h.8": 0, + "transformer.h.9": 1, + "transformer.h.10": 0, + "transformer.h.11": 1, + "transformer.h.12": 0, + "transformer.h.13": 0, + "transformer.h.14": 1, + "transformer.h.15": 0, + "transformer.h.16": 0, + "transformer.h.17": 1, + "transformer.h.18": 1, + "transformer.h.19": 0, + "transformer.h.20": 1, + "transformer.h.21": 1, + "transformer.h.22": 0, + "transformer.h.23": 0, + "transformer.ln_f": 1, } - bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True) with init_empty_weights(): model_8bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -304,6 +357,8 @@ def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self): offload_folder=tmpdirname, offload_state_dict=True, ) + self.assertTrue(model_8bit.transformer.h[4].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(model_8bit.transformer.h[5].mlp.dense_4h_to_h.weight.__class__ == Int8Params) self.check_inference_correctness(model_8bit) def test_int8_serialization(self): @@ -338,6 +393,67 @@ def test_int8_serialization(self): self.check_inference_correctness(model_8bit_from_saved) + def test_int8_serialization_offload(self): + r""" + Test whether it is possible to serialize a model in 8-bit and offload weights to cpu/disk + """ + + from bitsandbytes.nn import Int8Params + from transformers import AutoConfig, AutoModelForCausalLM + + with tempfile.TemporaryDirectory() as tmpdirname: + # saving state dict for now but will save config and other in the future + self.accelerate.save_model(self.model_8bit, tmpdirname) + + with init_empty_weights(): + # let's suppose that we can get the right config + model_8bit_from_saved = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) + model_8bit_from_saved.tie_weights() + bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True) + device_map = { + "transformer.word_embeddings": "cpu", + "transformer.word_embeddings_layernorm": 0, + "lm_head": "cpu", + "transformer.h.0": "cpu", + "transformer.h.1": "cpu", + "transformer.h.2": "cpu", + "transformer.h.3": "disk", + "transformer.h.4": "disk", + "transformer.h.5": "disk", + "transformer.h.6": 0, + "transformer.h.7": 0, + "transformer.h.8": 0, + "transformer.h.9": 1, + "transformer.h.10": 0, + "transformer.h.11": 1, + "transformer.h.12": 0, + "transformer.h.13": 0, + "transformer.h.14": 1, + "transformer.h.15": 0, + "transformer.h.16": 0, + "transformer.h.17": 1, + "transformer.h.18": 1, + "transformer.h.19": 0, + "transformer.h.20": 1, + "transformer.h.21": 1, + "transformer.h.22": 0, + "transformer.h.23": 0, + "transformer.ln_f": 1, + } + model_8bit_from_saved = load_and_quantize_model( + model_8bit_from_saved, + bnb_quantization_config, + weights_location=tmpdirname + "/pytorch_model.bin", + device_map=device_map, + no_split_module_classes=["BloomBlock"], + offload_folder=tmpdirname + "/tmp", + offload_state_dict=True, + ) + + 
self.assertTrue(model_8bit_from_saved.transformer.h[4].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.assertTrue(model_8bit_from_saved.transformer.h[5].mlp.dense_4h_to_h.weight.__class__ == Int8Params) + self.check_inference_correctness(model_8bit_from_saved) + def test_int8_serialization_shard(self): r""" Test whether it is possible to serialize a model in 8-bit. @@ -647,7 +763,7 @@ def test_cpu_gpu_loading_random_device_map(self): "transformer.ln_f": 1, } - bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True) with init_empty_weights(): model_4bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -677,7 +793,7 @@ def test_cpu_gpu_loading_custom_device_map(self): "transformer.ln_f": 1, } - bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True) with init_empty_weights(): model_4bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name)) @@ -707,7 +823,7 @@ def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self): "transformer.h": 1, "transformer.ln_f": "cpu", } - bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True, enable_fp32_cpu_offload=True) + bnb_quantization_config = BnbQuantizationConfig(load_in_4bit=True) with init_empty_weights(): model_4bit = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name))