From 1e28a0db20cc97bc05292f6d89b0bcd1f433a354 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Mon, 23 Sep 2024 20:22:26 +0200 Subject: [PATCH 1/9] quantization_cpu base version --- thunder/tests/test_quantization.py | 73 ++++++++ thunder/transforms/quantization.py | 17 +- thunder/transforms/quantization_cpu.py | 222 +++++++++++++++++++++++++ 3 files changed, 309 insertions(+), 3 deletions(-) create mode 100644 thunder/tests/test_quantization.py create mode 100644 thunder/transforms/quantization_cpu.py diff --git a/thunder/tests/test_quantization.py b/thunder/tests/test_quantization.py new file mode 100644 index 0000000000..832d9b0e1d --- /dev/null +++ b/thunder/tests/test_quantization.py @@ -0,0 +1,73 @@ +import torch +import time +from thunder.transforms.quantization import BitsAndBytesLinearQuant4bit + +def test_cpu_quantization(): + # Initialize quantization transform + quant_transform = BitsAndBytesLinearQuant4bit() + + # Create a tensor on CPU + weight = torch.randn(3, 3, device='cpu') + + # Quantize weight (expect only the quantized tensor, not a tuple) + quantized_weight = quant_transform.quantize_weight(weight) + + # Check that the quantized tensor has fewer or equal elements due to compression + original_num_elements = weight.numel() + quantized_num_elements = quantized_weight.numel() + + assert quantized_weight is not None, "Quantized weight is None" + assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" + +def test_gpu_quantization(): + if not torch.cuda.is_available(): + return + + # Initialize quantization transform + quant_transform = BitsAndBytesLinearQuant4bit() + + # Create a tensor on GPU + weight = torch.randn(3, 3, device='cuda') + + # Quantize weight (expect only the quantized tensor, not a tuple) + quantized_weight = quant_transform.quantize_weight(weight)[0] + + # Check that the quantized tensor has fewer or equal elements due to compression + original_num_elements = weight.numel() + quantized_num_elements = quantized_weight.numel() + + assert quantized_weight is not None, "Quantized weight is None" + assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" + +# Optional: Performance tests +def measure_time(device_type): + quant_transform = BitsAndBytesLinearQuant4bit() + + if device_type == 'cuda' and torch.cuda.is_available(): + device = torch.device('cuda') + else: + device = torch.device('cpu') + + weight = torch.randn(1000, 1000, device=device) + + start_time = time.time() + quantized_weight = quant_transform.quantize_weight(weight) # Expect only the quantized tensor + end_time = time.time() + + print(f"Quantization time on {device_type}: {end_time - start_time:.4f} seconds") + +# Run functional tests +print("Testing CPU quantization:") +test_cpu_quantization() + +if torch.cuda.is_available(): + print("\nTesting GPU quantization:") + test_gpu_quantization() +else: + print("\nGPU not available, skipping GPU test.") + +# Run performance tests +print("\nMeasuring performance:") +measure_time('cpu') +if torch.cuda.is_available(): + measure_time('cuda') diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index f4eebfb752..31d0ab977e 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -14,6 +14,7 @@ add_trace_output, ) +from .quantization_cpu import quantize_4bit_impl bitsandbytes_executor = None @@ -91,10 +92,20 @@ def __init__(self): 
get_bitsandbytes_executor() def quantize_weight(self, w): - # todo: revisit staying on CPU when bnb supports it if w.device.type == "meta": - w_work = torch.zeros_like(w, device="cuda") - elif w.device.type != "cuda": + num_elements = w.numel() + return torch.empty((num_elements, 1), device="meta", dtype=torch.uint8) + + # CPU quantization without returning the quantization state. + # Currently, the quantization state is omitted for CPU as the primary goal is to optimize + # for inference. If the use case involves fine-tuning or dequantizing weights back to + # their original precision, it may be necessary to return the state. This can be revisited + # if future use cases require more flexibility, such as further model training or analysis + # of quantization effects on the CPU. + if w.device.type == "cpu": + return quantize_4bit_impl(w, quant_type="nf4")[0] + + if w.device.type != "cuda": with torch.no_grad(): w_work = w.to("cuda") else: diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py new file mode 100644 index 0000000000..a3a7d1883e --- /dev/null +++ b/thunder/transforms/quantization_cpu.py @@ -0,0 +1,222 @@ +# NOTE: The code for CPU quantization in this file has been adapted from a not-yet-merged branch of the +# bitsandbytes library (https://github.com/bitsandbytes-foundation/bitsandbytes/tree/multi-backend-refactor). +# Once the changes in that branch are merged into the main bitsandbytes repository, this implementation +# should be replaced with the official, upstream version to ensure better compatibility, performance, +# and future updates. +# Please track the progress of the bitsandbytes library and update this file when necessary. + +import warnings + +import torch + +from bitsandbytes.functional import ( + QuantState, + get_4bit_type, +) + +Tensor = torch.Tensor + +NF4_QUANT_TABLE = [ + -1.0 - 1e-2, # 0b0000 + -0.8480964004993439, # 0b0001 + -0.6106329262256622, # 0b0010 + -0.4599952697753906, # 0b0011 + -0.33967943489551544, # 0b0100 + -0.23460740596055984, # 0b0101 + -0.13791173323988914, # 0b0110 + -0.045525018125772476, # 0b0111 + 0.03979014977812767, # 0b1000 + 0.1202552504837513, # 0b1001 + 0.2035212516784668, # 0b1010 + 0.2920137718319893, # 0b1011 + 0.3893125355243683, # 0b1100 + 0.5016634166240692, # 0b1101 + 0.6427869200706482, # 0b1110 + 0.8614784181118011, # 0b1111 +] + +FP4_QUANT_TABLE = { + 0 - 1e-2: 0, # 0b0000 + 0.00260417: 1, # 0b0001 + 0.0859375: 6, # 0b0110 + 0.20833333: 7, # 0b0111 + 0.29166667: 4, # 0b0100 + 0.4166667: 5, # 0b0101 + 0.583333: 2, # 0b0010 + 0.8333333: 3, # 0b0011 +} + +def get_4bit_type(typename, device=None, blocksize=64): + if device is None: + device = "cuda" + data = None + if typename == "nf4": + """ Implements the NF4 data type. + + Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that + is normalized into the range [-1, 1]. + + For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) + + Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in + the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. 
+ """ + data = [ + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0, + ] + elif typename == "fp4": + # 0b000 = 0 + # 0b001 = 0.0625 + # 0b010 = 8 + # 0b011 = 12 + # 0b100 = 4 + # 0b101 = 6 + # 0b110 = 2 + # 0b111 = 3 + # can also be created with bnb.functional.create_fp8_map(signed=True, exponent_bits=2, precision_bits=1, total_bits=4) + data = [0, 0.0625, 8.0, 12.0, 4.0, 6.0, 2.0, 3.0, -0, -0.0625, -8.0, -12.0, -4.0, -6.0, -2.0, -3.0] + elif typename == "int4": + data = [7, 6, 5, 4, 3, 2, 1, 0, -0, -1, -2, -3, -4, -5, -6, -7] + elif typename == "af4": + # Taken from: NF4 Isn't Information Theoretically Optimal (and that's Good) + # https://arxiv.org/abs/2306.06965 + if blocksize == 64: + data = [ + -1.0, + -0.69441008, + -0.51243739, + -0.3736951, + -0.25607552, + -0.14982478, + -0.04934812, + 0.0, + 0.04273164, + 0.12934483, + 0.21961274, + 0.31675666, + 0.42563882, + 0.55496234, + 0.72424863, + 1.0, + ][::-1] + else: + raise NotImplementedError("4-bit AbnormalFloats currently only support blocksize 64.") + + if data is None: + raise NotImplementedError(f"Typename {typename} not supported") + + data = torch.tensor(data, device=device) + data.div_(data.abs().max()) + + assert data.numel() == 16 + + return data + +def quantize_4bit_impl( + A: Tensor, + absmax: Tensor = None, + out: Tensor = None, + blocksize=64, + compress_statistics=False, + quant_type="nf4", +) -> Tensor: + """ + Quantize tensor A in blocks of 4-bit values. + + Quantizes tensor A by dividing it into blocks which are independently quantized to FP4. + + Parameters + ---------- + A : torch.Tensor + The input tensor. + absmax : torch.Tensor + The absmax values. + out : torch.Tensor + The output tensor (8-bit). + blocksize : int + The blocksize used in quantization. + quant_type : str + The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now + + Returns + ------- + torch.Tensor: + The 8-bit tensor with packed 4-bit values. + tuple(torch.Tensor, torch.Size, torch.dtype, int): + The quantization state to undo the quantization. + """ + if quant_type not in ["nf4", "fp4"]: + raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU.") + if quant_type == "fp4": + warnings.warn("fp4 quantization is currently slow on CPU/XPU. 
Please Use nf4 instead for better performance.") + assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] + n = A.numel() + input_shape = A.shape + blocks = n // blocksize + blocks += 1 if n % blocksize > 0 else 0 + + if absmax is None: + absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype) + + if out is None: + out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device) + + rem = n % blocksize + has_rem = rem > 0 + + # Scale tensor to [-1, 1] + A_reshaped = A.reshape(n) + A_com = A_reshaped[: n - rem] + A_com_reshaped = A_com.reshape(n // blocksize, blocksize) + absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0] + scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[: blocks - has_rem].view(-1, 1)), -1, 1) + scaled_A = scaled_A.reshape(-1) + if has_rem: + absmax[-1] = torch.abs(A_reshaped[n - rem :]).max() + scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1) + scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0) + # map [-1, 1] to nf4/fp4 + out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8) + if quant_type == "nf4": + for i in range(len(NF4_QUANT_TABLE)): + out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i + elif quant_type == "fp4": + sign = scaled_A < 0 + abs_scaled_A = torch.abs(scaled_A) + for key, val in FP4_QUANT_TABLE.items(): + out_uint8[abs_scaled_A > key] = val + out_uint8 += sign.to(torch.uint8) * 8 + if out_uint8.size(-1) % 2: + out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0) + out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) + + code = get_4bit_type(quant_type, device=A.device) + + if compress_statistics: + raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") + else: + state = QuantState( + absmax=absmax, + shape=input_shape, + dtype=A.dtype, + blocksize=blocksize, + code=code, + quant_type=quant_type, + ) + + return out.unsqueeze(0), state From 688c3237b1f0a9e7a90b21b86c2da56f0c877eab Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 21:47:13 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- thunder/tests/test_quantization.py | 28 +++++++++++++++++--------- thunder/transforms/quantization.py | 10 ++++----- thunder/transforms/quantization_cpu.py | 20 +++++++++--------- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/thunder/tests/test_quantization.py b/thunder/tests/test_quantization.py index 832d9b0e1d..d94a5c76fd 100644 --- a/thunder/tests/test_quantization.py +++ b/thunder/tests/test_quantization.py @@ -2,12 +2,13 @@ import time from thunder.transforms.quantization import BitsAndBytesLinearQuant4bit + def test_cpu_quantization(): # Initialize quantization transform quant_transform = BitsAndBytesLinearQuant4bit() # Create a tensor on CPU - weight = torch.randn(3, 3, device='cpu') + weight = torch.randn(3, 3, device="cpu") # Quantize weight (expect only the quantized tensor, not a tuple) quantized_weight = quant_transform.quantize_weight(weight) @@ -15,9 +16,12 @@ def test_cpu_quantization(): # Check that the quantized tensor has fewer or equal elements due to compression original_num_elements = weight.numel() quantized_num_elements = quantized_weight.numel() - + assert quantized_weight is not None, "Quantized weight is None" - assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" 
+ assert ( + quantized_num_elements <= original_num_elements + ), "Quantized tensor should have fewer or equal elements due to compression" + def test_gpu_quantization(): if not torch.cuda.is_available(): @@ -27,7 +31,7 @@ def test_gpu_quantization(): quant_transform = BitsAndBytesLinearQuant4bit() # Create a tensor on GPU - weight = torch.randn(3, 3, device='cuda') + weight = torch.randn(3, 3, device="cuda") # Quantize weight (expect only the quantized tensor, not a tuple) quantized_weight = quant_transform.quantize_weight(weight)[0] @@ -37,16 +41,19 @@ def test_gpu_quantization(): quantized_num_elements = quantized_weight.numel() assert quantized_weight is not None, "Quantized weight is None" - assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" + assert ( + quantized_num_elements <= original_num_elements + ), "Quantized tensor should have fewer or equal elements due to compression" + # Optional: Performance tests def measure_time(device_type): quant_transform = BitsAndBytesLinearQuant4bit() - if device_type == 'cuda' and torch.cuda.is_available(): - device = torch.device('cuda') + if device_type == "cuda" and torch.cuda.is_available(): + device = torch.device("cuda") else: - device = torch.device('cpu') + device = torch.device("cpu") weight = torch.randn(1000, 1000, device=device) @@ -56,6 +63,7 @@ def measure_time(device_type): print(f"Quantization time on {device_type}: {end_time - start_time:.4f} seconds") + # Run functional tests print("Testing CPU quantization:") test_cpu_quantization() @@ -68,6 +76,6 @@ def measure_time(device_type): # Run performance tests print("\nMeasuring performance:") -measure_time('cpu') +measure_time("cpu") if torch.cuda.is_available(): - measure_time('cuda') + measure_time("cuda") diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index 31d0ab977e..c07508583f 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -95,16 +95,16 @@ def quantize_weight(self, w): if w.device.type == "meta": num_elements = w.numel() return torch.empty((num_elements, 1), device="meta", dtype=torch.uint8) - + # CPU quantization without returning the quantization state. - # Currently, the quantization state is omitted for CPU as the primary goal is to optimize - # for inference. If the use case involves fine-tuning or dequantizing weights back to + # Currently, the quantization state is omitted for CPU as the primary goal is to optimize + # for inference. If the use case involves fine-tuning or dequantizing weights back to # their original precision, it may be necessary to return the state. This can be revisited - # if future use cases require more flexibility, such as further model training or analysis + # if future use cases require more flexibility, such as further model training or analysis # of quantization effects on the CPU. 
if w.device.type == "cpu": return quantize_4bit_impl(w, quant_type="nf4")[0] - + if w.device.type != "cuda": with torch.no_grad(): w_work = w.to("cuda") diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index a3a7d1883e..2d19d7454c 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -1,7 +1,7 @@ -# NOTE: The code for CPU quantization in this file has been adapted from a not-yet-merged branch of the +# NOTE: The code for CPU quantization in this file has been adapted from a not-yet-merged branch of the # bitsandbytes library (https://github.com/bitsandbytes-foundation/bitsandbytes/tree/multi-backend-refactor). -# Once the changes in that branch are merged into the main bitsandbytes repository, this implementation -# should be replaced with the official, upstream version to ensure better compatibility, performance, +# Once the changes in that branch are merged into the main bitsandbytes repository, this implementation +# should be replaced with the official, upstream version to ensure better compatibility, performance, # and future updates. # Please track the progress of the bitsandbytes library and update this file when necessary. @@ -46,20 +46,21 @@ 0.8333333: 3, # 0b0011 } + def get_4bit_type(typename, device=None, blocksize=64): if device is None: device = "cuda" data = None if typename == "nf4": - """ Implements the NF4 data type. + """Implements the NF4 data type. - Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that - is normalized into the range [-1, 1]. + Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that + is normalized into the range [-1, 1]. - For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) + For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) - Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in - the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. + Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in + the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. 
""" data = [ -1.0, @@ -127,6 +128,7 @@ def get_4bit_type(typename, device=None, blocksize=64): return data + def quantize_4bit_impl( A: Tensor, absmax: Tensor = None, From d3bcbf9275c0f8d8b6d7711607bca8bdba522d55 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Tue, 8 Oct 2024 19:10:23 +0200 Subject: [PATCH 3/9] quantize_weight update for meta andd cpu --- thunder/tests/test_quantization.py | 73 ---------- thunder/transforms/quantization.py | 34 +++-- thunder/transforms/quantization_cpu.py | 181 +++++++++++++------------ 3 files changed, 116 insertions(+), 172 deletions(-) delete mode 100644 thunder/tests/test_quantization.py diff --git a/thunder/tests/test_quantization.py b/thunder/tests/test_quantization.py deleted file mode 100644 index 832d9b0e1d..0000000000 --- a/thunder/tests/test_quantization.py +++ /dev/null @@ -1,73 +0,0 @@ -import torch -import time -from thunder.transforms.quantization import BitsAndBytesLinearQuant4bit - -def test_cpu_quantization(): - # Initialize quantization transform - quant_transform = BitsAndBytesLinearQuant4bit() - - # Create a tensor on CPU - weight = torch.randn(3, 3, device='cpu') - - # Quantize weight (expect only the quantized tensor, not a tuple) - quantized_weight = quant_transform.quantize_weight(weight) - - # Check that the quantized tensor has fewer or equal elements due to compression - original_num_elements = weight.numel() - quantized_num_elements = quantized_weight.numel() - - assert quantized_weight is not None, "Quantized weight is None" - assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" - -def test_gpu_quantization(): - if not torch.cuda.is_available(): - return - - # Initialize quantization transform - quant_transform = BitsAndBytesLinearQuant4bit() - - # Create a tensor on GPU - weight = torch.randn(3, 3, device='cuda') - - # Quantize weight (expect only the quantized tensor, not a tuple) - quantized_weight = quant_transform.quantize_weight(weight)[0] - - # Check that the quantized tensor has fewer or equal elements due to compression - original_num_elements = weight.numel() - quantized_num_elements = quantized_weight.numel() - - assert quantized_weight is not None, "Quantized weight is None" - assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" - -# Optional: Performance tests -def measure_time(device_type): - quant_transform = BitsAndBytesLinearQuant4bit() - - if device_type == 'cuda' and torch.cuda.is_available(): - device = torch.device('cuda') - else: - device = torch.device('cpu') - - weight = torch.randn(1000, 1000, device=device) - - start_time = time.time() - quantized_weight = quant_transform.quantize_weight(weight) # Expect only the quantized tensor - end_time = time.time() - - print(f"Quantization time on {device_type}: {end_time - start_time:.4f} seconds") - -# Run functional tests -print("Testing CPU quantization:") -test_cpu_quantization() - -if torch.cuda.is_available(): - print("\nTesting GPU quantization:") - test_gpu_quantization() -else: - print("\nGPU not available, skipping GPU test.") - -# Run performance tests -print("\nMeasuring performance:") -measure_time('cpu') -if torch.cuda.is_available(): - measure_time('cuda') diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index 902d44d41e..ff9e502554 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -14,7 +14,8 @@ 
trace_with_replaced_proxy_metadata, ) -from .quantization_cpu import quantize_4bit_impl +from .quantization_cpu import quantize_4bit_cpu +from bitsandbytes.functional import QuantState, get_4bit_type bitsandbytes_executor = None @@ -55,17 +56,28 @@ def __init__(self): def quantize_weight(self, w): if w.device.type == "meta": - num_elements = w.numel() - return torch.empty((num_elements, 1), device="meta", dtype=torch.uint8) - - # CPU quantization without returning the quantization state. - # Currently, the quantization state is omitted for CPU as the primary goal is to optimize - # for inference. If the use case involves fine-tuning or dequantizing weights back to - # their original precision, it may be necessary to return the state. This can be revisited - # if future use cases require more flexibility, such as further model training or analysis - # of quantization effects on the CPU. + n = w.numel() + output_shape = ((n + 1) // 2,) + blocksize=64 + blocks = n // blocksize + blocks += 1 if n % blocksize > 0 else 0 + absmax = torch.zeros((blocks,), device=w.device, dtype=w.dtype) + quant_type="nf4" + code = get_4bit_type(quant_type, device=w.device) + + # Return only shape and dtype for meta tensors without calculation + state = QuantState( + absmax=absmax, + shape=w.shape, + dtype=w.dtype, + blocksize=64, + code=code, + quant_type=quant_type, + ) + return torch.empty(output_shape, device="meta", dtype=torch.uint8), state + if w.device.type == "cpu": - return quantize_4bit_impl(w, quant_type="nf4")[0] + return quantize_4bit_cpu(w, quant_type="nf4") if w.device.type != "cuda": with torch.no_grad(): diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index a3a7d1883e..3b31f0c5ac 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -1,12 +1,37 @@ -# NOTE: The code for CPU quantization in this file has been adapted from a not-yet-merged branch of the -# bitsandbytes library (https://github.com/bitsandbytes-foundation/bitsandbytes/tree/multi-backend-refactor). -# Once the changes in that branch are merged into the main bitsandbytes repository, this implementation -# should be replaced with the official, upstream version to ensure better compatibility, performance, -# and future updates. -# Please track the progress of the bitsandbytes library and update this file when necessary. - +""" +Derivied from + https://github.com/bitsandbytes-foundation/bitsandbytes + +The code for CPU quantization in this file has been adapted from a not-yet-merged +multi-backend-refactor branch + +MIT License: + https://github.com/bitsandbytes-foundation/bitsandbytes/blob/main/LICENSE + +Copyright (c) Facebook, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +import subprocess +from typing import Literal, Optional, Tuple import warnings - import torch from bitsandbytes.functional import ( @@ -14,8 +39,40 @@ get_4bit_type, ) +try: + # to support Intel CPU/GPU (XPU) backend + import intel_extension_for_pytorch as ipex + + ipex_cpu = ipex if ipex._C._has_cpu() else None + ipex_xpu = ipex if ipex._C._has_xpu() else None +except BaseException: + ipex_cpu = None + ipex_xpu = None + +gxx_available = False +try: + subprocess.run(["g++", "--version"]) + gxx_available = True +except BaseException: + warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.") + Tensor = torch.Tensor +def _torch_version_prereq(major, minor): + ver_major = int(torch.__version__.split(".")[0]) + ver_minor = int(torch.__version__.split(".")[1]) + return ver_major * 32 + ver_minor >= major * 32 + minor + +def _maybe_torch_compile(func): + # torch.compile requires g++ and pytorch >= 2.0 + if gxx_available and _torch_version_prereq(2, 0): + options = {} + # fx_graph_cache requires pytorch >= 2.2 + if _torch_version_prereq(2, 2): + options.update({"fx_graph_cache": True}) + return torch.compile(func, dynamic=True, options=options) + return func + NF4_QUANT_TABLE = [ -1.0 - 1e-2, # 0b0000 -0.8480964004993439, # 0b0001 @@ -46,87 +103,35 @@ 0.8333333: 3, # 0b0011 } -def get_4bit_type(typename, device=None, blocksize=64): - if device is None: - device = "cuda" - data = None - if typename == "nf4": - """ Implements the NF4 data type. - - Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that - is normalized into the range [-1, 1]. - - For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) - - Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in - the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. 
- """ - data = [ - -1.0, - -0.6961928009986877, - -0.5250730514526367, - -0.39491748809814453, - -0.28444138169288635, - -0.18477343022823334, - -0.09105003625154495, - 0.0, - 0.07958029955625534, - 0.16093020141124725, - 0.24611230194568634, - 0.33791524171829224, - 0.44070982933044434, - 0.5626170039176941, - 0.7229568362236023, - 1.0, - ] - elif typename == "fp4": - # 0b000 = 0 - # 0b001 = 0.0625 - # 0b010 = 8 - # 0b011 = 12 - # 0b100 = 4 - # 0b101 = 6 - # 0b110 = 2 - # 0b111 = 3 - # can also be created with bnb.functional.create_fp8_map(signed=True, exponent_bits=2, precision_bits=1, total_bits=4) - data = [0, 0.0625, 8.0, 12.0, 4.0, 6.0, 2.0, 3.0, -0, -0.0625, -8.0, -12.0, -4.0, -6.0, -2.0, -3.0] - elif typename == "int4": - data = [7, 6, 5, 4, 3, 2, 1, 0, -0, -1, -2, -3, -4, -5, -6, -7] - elif typename == "af4": - # Taken from: NF4 Isn't Information Theoretically Optimal (and that's Good) - # https://arxiv.org/abs/2306.06965 - if blocksize == 64: - data = [ - -1.0, - -0.69441008, - -0.51243739, - -0.3736951, - -0.25607552, - -0.14982478, - -0.04934812, - 0.0, - 0.04273164, - 0.12934483, - 0.21961274, - 0.31675666, - 0.42563882, - 0.55496234, - 0.72424863, - 1.0, - ][::-1] - else: - raise NotImplementedError("4-bit AbnormalFloats currently only support blocksize 64.") - - if data is None: - raise NotImplementedError(f"Typename {typename} not supported") - - data = torch.tensor(data, device=device) - data.div_(data.abs().max()) - - assert data.numel() == 16 - - return data +def assert_on_cpu(tensors): + on_cpu = True + for t in tensors: + if t is None: + continue # NULL pointers are fine + on_cpu &= t.device.type == "cpu" + if not on_cpu: + raise TypeError( + "All input tensors need to be on CPU, but found some tensors to not be on CPU:\n" + f" {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}" + ) + return on_cpu +def quantize_4bit_cpu( + A: torch.Tensor, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=64, + compress_statistics=False, + quant_type: Literal["fp4", "nf4"] = "fp4", + quant_storage=torch.uint8, +) -> Tuple[torch.Tensor, QuantState]: + if blocksize is None: + blocksize = 64 + assert_on_cpu([A, absmax, out]) + assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage" + return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type) + +@_maybe_torch_compile def quantize_4bit_impl( A: Tensor, absmax: Tensor = None, From cdb1116b7c9a25531512ef2df7d39fc6702ae7ee Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Tue, 8 Oct 2024 19:19:50 +0200 Subject: [PATCH 4/9] update --- thunder/tests/test_quantization.py | 81 ------------------------------ 1 file changed, 81 deletions(-) delete mode 100644 thunder/tests/test_quantization.py diff --git a/thunder/tests/test_quantization.py b/thunder/tests/test_quantization.py deleted file mode 100644 index d94a5c76fd..0000000000 --- a/thunder/tests/test_quantization.py +++ /dev/null @@ -1,81 +0,0 @@ -import torch -import time -from thunder.transforms.quantization import BitsAndBytesLinearQuant4bit - - -def test_cpu_quantization(): - # Initialize quantization transform - quant_transform = BitsAndBytesLinearQuant4bit() - - # Create a tensor on CPU - weight = torch.randn(3, 3, device="cpu") - - # Quantize weight (expect only the quantized tensor, not a tuple) - quantized_weight = quant_transform.quantize_weight(weight) - - # Check that the quantized tensor has fewer or equal elements due to compression - original_num_elements = 
weight.numel() - quantized_num_elements = quantized_weight.numel() - - assert quantized_weight is not None, "Quantized weight is None" - assert ( - quantized_num_elements <= original_num_elements - ), "Quantized tensor should have fewer or equal elements due to compression" - - -def test_gpu_quantization(): - if not torch.cuda.is_available(): - return - - # Initialize quantization transform - quant_transform = BitsAndBytesLinearQuant4bit() - - # Create a tensor on GPU - weight = torch.randn(3, 3, device="cuda") - - # Quantize weight (expect only the quantized tensor, not a tuple) - quantized_weight = quant_transform.quantize_weight(weight)[0] - - # Check that the quantized tensor has fewer or equal elements due to compression - original_num_elements = weight.numel() - quantized_num_elements = quantized_weight.numel() - - assert quantized_weight is not None, "Quantized weight is None" - assert ( - quantized_num_elements <= original_num_elements - ), "Quantized tensor should have fewer or equal elements due to compression" - - -# Optional: Performance tests -def measure_time(device_type): - quant_transform = BitsAndBytesLinearQuant4bit() - - if device_type == "cuda" and torch.cuda.is_available(): - device = torch.device("cuda") - else: - device = torch.device("cpu") - - weight = torch.randn(1000, 1000, device=device) - - start_time = time.time() - quantized_weight = quant_transform.quantize_weight(weight) # Expect only the quantized tensor - end_time = time.time() - - print(f"Quantization time on {device_type}: {end_time - start_time:.4f} seconds") - - -# Run functional tests -print("Testing CPU quantization:") -test_cpu_quantization() - -if torch.cuda.is_available(): - print("\nTesting GPU quantization:") - test_gpu_quantization() -else: - print("\nGPU not available, skipping GPU test.") - -# Run performance tests -print("\nMeasuring performance:") -measure_time("cpu") -if torch.cuda.is_available(): - measure_time("cuda") From d0b683fee9af2c039e9828d89214392f225388f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 21:49:34 +0000 Subject: [PATCH 5/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- thunder/transforms/quantization.py | 6 +++--- thunder/transforms/quantization_cpu.py | 14 ++++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index ff9e502554..c5ab06ea0b 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -58,11 +58,11 @@ def quantize_weight(self, w): if w.device.type == "meta": n = w.numel() output_shape = ((n + 1) // 2,) - blocksize=64 + blocksize = 64 blocks = n // blocksize blocks += 1 if n % blocksize > 0 else 0 absmax = torch.zeros((blocks,), device=w.device, dtype=w.dtype) - quant_type="nf4" + quant_type = "nf4" code = get_4bit_type(quant_type, device=w.device) # Return only shape and dtype for meta tensors without calculation @@ -78,7 +78,7 @@ def quantize_weight(self, w): if w.device.type == "cpu": return quantize_4bit_cpu(w, quant_type="nf4") - + if w.device.type != "cuda": with torch.no_grad(): w_work = w.to("cuda") diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index 3b31f0c5ac..0e8e84837d 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -4,7 +4,7 @@ The code for CPU quantization in this file has been 
adapted from a not-yet-merged multi-backend-refactor branch - + MIT License: https://github.com/bitsandbytes-foundation/bitsandbytes/blob/main/LICENSE @@ -58,11 +58,13 @@ Tensor = torch.Tensor + def _torch_version_prereq(major, minor): ver_major = int(torch.__version__.split(".")[0]) ver_minor = int(torch.__version__.split(".")[1]) return ver_major * 32 + ver_minor >= major * 32 + minor + def _maybe_torch_compile(func): # torch.compile requires g++ and pytorch >= 2.0 if gxx_available and _torch_version_prereq(2, 0): @@ -73,6 +75,7 @@ def _maybe_torch_compile(func): return torch.compile(func, dynamic=True, options=options) return func + NF4_QUANT_TABLE = [ -1.0 - 1e-2, # 0b0000 -0.8480964004993439, # 0b0001 @@ -103,6 +106,7 @@ def _maybe_torch_compile(func): 0.8333333: 3, # 0b0011 } + def assert_on_cpu(tensors): on_cpu = True for t in tensors: @@ -116,21 +120,23 @@ def assert_on_cpu(tensors): ) return on_cpu + def quantize_4bit_cpu( A: torch.Tensor, - absmax: Optional[torch.Tensor] = None, - out: Optional[torch.Tensor] = None, + absmax: torch.Tensor | None = None, + out: torch.Tensor | None = None, blocksize=64, compress_statistics=False, quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, -) -> Tuple[torch.Tensor, QuantState]: +) -> tuple[torch.Tensor, QuantState]: if blocksize is None: blocksize = 64 assert_on_cpu([A, absmax, out]) assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage" return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type) + @_maybe_torch_compile def quantize_4bit_impl( A: Tensor, From 63fc6d4b893685a499495e60128e80b3fb3bc930 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Sat, 12 Oct 2024 04:03:24 +0200 Subject: [PATCH 6/9] meta and cpu shape update --- thunder/transforms/quantization.py | 2 +- thunder/transforms/quantization_cpu.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index c5ab06ea0b..234244c044 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -57,7 +57,7 @@ def __init__(self): def quantize_weight(self, w): if w.device.type == "meta": n = w.numel() - output_shape = ((n + 1) // 2,) + output_shape = ((n + 1) // 2, 1) blocksize = 64 blocks = n // blocksize blocks += 1 if n % blocksize > 0 else 0 diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index 0e8e84837d..b8df98a46f 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -185,7 +185,9 @@ def quantize_4bit_impl( absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype) if out is None: - out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device) + # change to 2D shape instead of unsqueeze(0) to be consistent with + # CUDA implementation in multi-backend-refactor branch + out = torch.zeros(((n + 1) // 2, 1), dtype=torch.uint8, device=A.device) rem = n % blocksize has_rem = rem > 0 @@ -230,4 +232,4 @@ def quantize_4bit_impl( quant_type=quant_type, ) - return out.unsqueeze(0), state + return out, state From b4e7daee697d4529beaa75c4d69d686730e58956 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Mon, 21 Oct 2024 12:20:05 +0200 Subject: [PATCH 7/9] META, CPU and CUDA consistency changes --- thunder/transforms/quantization.py | 3 ++- thunder/transforms/quantization_cpu.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/thunder/transforms/quantization.py 
b/thunder/transforms/quantization.py index 234244c044..a90e8a3c51 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -61,7 +61,8 @@ def quantize_weight(self, w): blocksize = 64 blocks = n // blocksize blocks += 1 if n % blocksize > 0 else 0 - absmax = torch.zeros((blocks,), device=w.device, dtype=w.dtype) + # cuda absmax dtype is torch.float32 instead of dtype=A.dtype + absmax = torch.zeros((blocks,), device=w.device, dtype=torch.float32) quant_type = "nf4" code = get_4bit_type(quant_type, device=w.device) diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index b8df98a46f..a5f1fabfb8 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -182,7 +182,8 @@ def quantize_4bit_impl( blocks += 1 if n % blocksize > 0 else 0 if absmax is None: - absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype) + # cuda absmax dtype is torch.float32 instead of dtype=A.dtype + absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32) if out is None: # change to 2D shape instead of unsqueeze(0) to be consistent with @@ -216,7 +217,14 @@ def quantize_4bit_impl( out_uint8 += sign.to(torch.uint8) * 8 if out_uint8.size(-1) % 2: out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0) - out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) + + # Perform the bitwise operations + result = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) + + # Reshape the result to a 2D tensor with shape [N, 1] + # CUDA out is 2D tensor + out[:] = result.view(-1, 1) + # out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) code = get_4bit_type(quant_type, device=A.device) From 4b15a72cd41bec180b39a4215551b124c8950713 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Sun, 3 Nov 2024 20:06:25 +0100 Subject: [PATCH 8/9] bitsandbytes import update --- thunder/transforms/quantization.py | 5 ++--- thunder/transforms/quantization_cpu.py | 11 ++++------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index a90e8a3c51..647945fa5c 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -15,7 +15,6 @@ ) from .quantization_cpu import quantize_4bit_cpu -from bitsandbytes.functional import QuantState, get_4bit_type bitsandbytes_executor = None @@ -64,10 +63,10 @@ def quantize_weight(self, w): # cuda absmax dtype is torch.float32 instead of dtype=A.dtype absmax = torch.zeros((blocks,), device=w.device, dtype=torch.float32) quant_type = "nf4" - code = get_4bit_type(quant_type, device=w.device) + code = bitsandbytes.functional.get_4bit_type(quant_type, device=w.device) # Return only shape and dtype for meta tensors without calculation - state = QuantState( + state = bitsandbytes.functional.QuantState( absmax=absmax, shape=w.shape, dtype=w.dtype, diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index a5f1fabfb8..33f0b0b671 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -34,10 +34,7 @@ import warnings import torch -from bitsandbytes.functional import ( - QuantState, - get_4bit_type, -) +import bitsandbytes try: # to support Intel CPU/GPU (XPU) backend @@ -129,7 +126,7 @@ def quantize_4bit_cpu( compress_statistics=False, quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, -) -> tuple[torch.Tensor, 
QuantState]: +) -> tuple[torch.Tensor, bitsandbytes.functional.QuantState]: if blocksize is None: blocksize = 64 assert_on_cpu([A, absmax, out]) @@ -226,12 +223,12 @@ def quantize_4bit_impl( out[:] = result.view(-1, 1) # out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) - code = get_4bit_type(quant_type, device=A.device) + code = bitsandbytes.functional.get_4bit_type(quant_type, device=A.device) if compress_statistics: raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") else: - state = QuantState( + state = bitsandbytes.functional.QuantState( absmax=absmax, shape=input_shape, dtype=A.dtype, From dbca6e917a59c9ae2bc4f487f74c62a2a3254f11 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Sun, 3 Nov 2024 20:35:03 +0100 Subject: [PATCH 9/9] update --- thunder/transforms/quantization_cpu.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index 33f0b0b671..a5f1fabfb8 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -34,7 +34,10 @@ import warnings import torch -import bitsandbytes +from bitsandbytes.functional import ( + QuantState, + get_4bit_type, +) try: # to support Intel CPU/GPU (XPU) backend @@ -126,7 +129,7 @@ def quantize_4bit_cpu( compress_statistics=False, quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, -) -> tuple[torch.Tensor, bitsandbytes.functional.QuantState]: +) -> tuple[torch.Tensor, QuantState]: if blocksize is None: blocksize = 64 assert_on_cpu([A, absmax, out]) @@ -223,12 +226,12 @@ def quantize_4bit_impl( out[:] = result.view(-1, 1) # out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) - code = bitsandbytes.functional.get_4bit_type(quant_type, device=A.device) + code = get_4bit_type(quant_type, device=A.device) if compress_statistics: raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") else: - state = bitsandbytes.functional.QuantState( + state = QuantState( absmax=absmax, shape=input_shape, dtype=A.dtype,
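
For reference, the packed nf4 layout that quantize_4bit_impl produces (two 4-bit codes per uint8, low nibble for even-indexed elements, high nibble for odd-indexed ones, plus the per-block absmax and 16-entry code table carried in the returned QuantState) can be inverted with a few tensor ops. The sketch below is illustrative only and is not part of the patches; dequantize_nf4_cpu_sketch is a hypothetical helper name, and it assumes the final state of the series (2-D ((n + 1) // 2, 1) uint8 output, float32 absmax, blocksize 64).

import torch

def dequantize_nf4_cpu_sketch(packed: torch.Tensor, state) -> torch.Tensor:
    # Hypothetical helper (not in the patches): inverts quantize_4bit_impl for
    # quant_type="nf4", using only the fields stored in the returned QuantState.
    n = state.shape.numel()                       # number of elements in the original weight
    flat = packed.reshape(-1)                     # one uint8 holds two 4-bit codes
    low = flat & 0x0F                             # codes of even-indexed elements
    high = flat >> 4                              # codes of odd-indexed elements
    codes = torch.stack((low, high), dim=-1).reshape(-1)[:n].long()
    values = state.code.to(torch.float32)[codes]  # look up the 16-entry nf4 table
    # Undo the per-block scaling: pad to whole blocks, multiply by the per-block absmax.
    blocks = state.absmax.numel()
    padded = torch.zeros(blocks * state.blocksize, dtype=torch.float32)
    padded[:n] = values
    padded = padded.reshape(blocks, state.blocksize) * state.absmax.reshape(-1, 1).to(torch.float32)
    return padded.reshape(-1)[:n].reshape(state.shape).to(state.dtype)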
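
A minimal round-trip check of the new CPU path, in the spirit of the test file added in patch 1 and later removed, could then look as follows. This is a sketch under the assumption that bitsandbytes imports cleanly on a CPU-only machine; dequantize_nf4_cpu_sketch is the hypothetical helper above, and since blockwise 4-bit quantization is lossy, only an approximate reconstruction is expected.

import torch
from thunder.transforms.quantization_cpu import quantize_4bit_cpu

weight = torch.randn(128, 64)  # CPU tensor, 8192 elements -> 4096 packed bytes
packed, state = quantize_4bit_cpu(weight, quant_type="nf4")

assert packed.dtype == torch.uint8
assert packed.shape == ((weight.numel() + 1) // 2, 1)  # 2-D, consistent with the CUDA layout
assert state.absmax.dtype == torch.float32 and state.blocksize == 64

restored = dequantize_nf4_cpu_sketch(packed, state)
assert restored.shape == weight.shape
print((restored - weight).abs().max())  # small but nonzero reconstruction error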