From 1e28a0db20cc97bc05292f6d89b0bcd1f433a354 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Mon, 23 Sep 2024 20:22:26 +0200 Subject: [PATCH 1/9] quantization_cpu base version --- thunder/tests/test_quantization.py | 73 ++++++++ thunder/transforms/quantization.py | 17 +- thunder/transforms/quantization_cpu.py | 222 +++++++++++++++++++++++++ 3 files changed, 309 insertions(+), 3 deletions(-) create mode 100644 thunder/tests/test_quantization.py create mode 100644 thunder/transforms/quantization_cpu.py diff --git a/thunder/tests/test_quantization.py b/thunder/tests/test_quantization.py new file mode 100644 index 0000000000..832d9b0e1d --- /dev/null +++ b/thunder/tests/test_quantization.py @@ -0,0 +1,73 @@ +import torch +import time +from thunder.transforms.quantization import BitsAndBytesLinearQuant4bit + +def test_cpu_quantization(): + # Initialize quantization transform + quant_transform = BitsAndBytesLinearQuant4bit() + + # Create a tensor on CPU + weight = torch.randn(3, 3, device='cpu') + + # Quantize weight (expect only the quantized tensor, not a tuple) + quantized_weight = quant_transform.quantize_weight(weight) + + # Check that the quantized tensor has fewer or equal elements due to compression + original_num_elements = weight.numel() + quantized_num_elements = quantized_weight.numel() + + assert quantized_weight is not None, "Quantized weight is None" + assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" + +def test_gpu_quantization(): + if not torch.cuda.is_available(): + return + + # Initialize quantization transform + quant_transform = BitsAndBytesLinearQuant4bit() + + # Create a tensor on GPU + weight = torch.randn(3, 3, device='cuda') + + # Quantize weight (expect only the quantized tensor, not a tuple) + quantized_weight = quant_transform.quantize_weight(weight)[0] + + # Check that the quantized tensor has fewer or equal elements due to compression + original_num_elements = weight.numel() + quantized_num_elements = quantized_weight.numel() + + assert quantized_weight is not None, "Quantized weight is None" + assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" + +# Optional: Performance tests +def measure_time(device_type): + quant_transform = BitsAndBytesLinearQuant4bit() + + if device_type == 'cuda' and torch.cuda.is_available(): + device = torch.device('cuda') + else: + device = torch.device('cpu') + + weight = torch.randn(1000, 1000, device=device) + + start_time = time.time() + quantized_weight = quant_transform.quantize_weight(weight) # Expect only the quantized tensor + end_time = time.time() + + print(f"Quantization time on {device_type}: {end_time - start_time:.4f} seconds") + +# Run functional tests +print("Testing CPU quantization:") +test_cpu_quantization() + +if torch.cuda.is_available(): + print("\nTesting GPU quantization:") + test_gpu_quantization() +else: + print("\nGPU not available, skipping GPU test.") + +# Run performance tests +print("\nMeasuring performance:") +measure_time('cpu') +if torch.cuda.is_available(): + measure_time('cuda') diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index f4eebfb752..31d0ab977e 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -14,6 +14,7 @@ add_trace_output, ) +from .quantization_cpu import quantize_4bit_impl bitsandbytes_executor = None @@ -91,10 +92,20 @@ def __init__(self): 
get_bitsandbytes_executor() def quantize_weight(self, w): - # todo: revisit staying on CPU when bnb supports it if w.device.type == "meta": - w_work = torch.zeros_like(w, device="cuda") - elif w.device.type != "cuda": + num_elements = w.numel() + return torch.empty((num_elements, 1), device="meta", dtype=torch.uint8) + + # CPU quantization without returning the quantization state. + # Currently, the quantization state is omitted for CPU as the primary goal is to optimize + # for inference. If the use case involves fine-tuning or dequantizing weights back to + # their original precision, it may be necessary to return the state. This can be revisited + # if future use cases require more flexibility, such as further model training or analysis + # of quantization effects on the CPU. + if w.device.type == "cpu": + return quantize_4bit_impl(w, quant_type="nf4")[0] + + if w.device.type != "cuda": with torch.no_grad(): w_work = w.to("cuda") else: diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py new file mode 100644 index 0000000000..a3a7d1883e --- /dev/null +++ b/thunder/transforms/quantization_cpu.py @@ -0,0 +1,222 @@ +# NOTE: The code for CPU quantization in this file has been adapted from a not-yet-merged branch of the +# bitsandbytes library (https://github.com/bitsandbytes-foundation/bitsandbytes/tree/multi-backend-refactor). +# Once the changes in that branch are merged into the main bitsandbytes repository, this implementation +# should be replaced with the official, upstream version to ensure better compatibility, performance, +# and future updates. +# Please track the progress of the bitsandbytes library and update this file when necessary. + +import warnings + +import torch + +from bitsandbytes.functional import ( + QuantState, + get_4bit_type, +) + +Tensor = torch.Tensor + +NF4_QUANT_TABLE = [ + -1.0 - 1e-2, # 0b0000 + -0.8480964004993439, # 0b0001 + -0.6106329262256622, # 0b0010 + -0.4599952697753906, # 0b0011 + -0.33967943489551544, # 0b0100 + -0.23460740596055984, # 0b0101 + -0.13791173323988914, # 0b0110 + -0.045525018125772476, # 0b0111 + 0.03979014977812767, # 0b1000 + 0.1202552504837513, # 0b1001 + 0.2035212516784668, # 0b1010 + 0.2920137718319893, # 0b1011 + 0.3893125355243683, # 0b1100 + 0.5016634166240692, # 0b1101 + 0.6427869200706482, # 0b1110 + 0.8614784181118011, # 0b1111 +] + +FP4_QUANT_TABLE = { + 0 - 1e-2: 0, # 0b0000 + 0.00260417: 1, # 0b0001 + 0.0859375: 6, # 0b0110 + 0.20833333: 7, # 0b0111 + 0.29166667: 4, # 0b0100 + 0.4166667: 5, # 0b0101 + 0.583333: 2, # 0b0010 + 0.8333333: 3, # 0b0011 +} + +def get_4bit_type(typename, device=None, blocksize=64): + if device is None: + device = "cuda" + data = None + if typename == "nf4": + """ Implements the NF4 data type. + + Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that + is normalized into the range [-1, 1]. + + For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) + + Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in + the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. 
+ """ + data = [ + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0, + ] + elif typename == "fp4": + # 0b000 = 0 + # 0b001 = 0.0625 + # 0b010 = 8 + # 0b011 = 12 + # 0b100 = 4 + # 0b101 = 6 + # 0b110 = 2 + # 0b111 = 3 + # can also be created with bnb.functional.create_fp8_map(signed=True, exponent_bits=2, precision_bits=1, total_bits=4) + data = [0, 0.0625, 8.0, 12.0, 4.0, 6.0, 2.0, 3.0, -0, -0.0625, -8.0, -12.0, -4.0, -6.0, -2.0, -3.0] + elif typename == "int4": + data = [7, 6, 5, 4, 3, 2, 1, 0, -0, -1, -2, -3, -4, -5, -6, -7] + elif typename == "af4": + # Taken from: NF4 Isn't Information Theoretically Optimal (and that's Good) + # https://arxiv.org/abs/2306.06965 + if blocksize == 64: + data = [ + -1.0, + -0.69441008, + -0.51243739, + -0.3736951, + -0.25607552, + -0.14982478, + -0.04934812, + 0.0, + 0.04273164, + 0.12934483, + 0.21961274, + 0.31675666, + 0.42563882, + 0.55496234, + 0.72424863, + 1.0, + ][::-1] + else: + raise NotImplementedError("4-bit AbnormalFloats currently only support blocksize 64.") + + if data is None: + raise NotImplementedError(f"Typename {typename} not supported") + + data = torch.tensor(data, device=device) + data.div_(data.abs().max()) + + assert data.numel() == 16 + + return data + +def quantize_4bit_impl( + A: Tensor, + absmax: Tensor = None, + out: Tensor = None, + blocksize=64, + compress_statistics=False, + quant_type="nf4", +) -> Tensor: + """ + Quantize tensor A in blocks of 4-bit values. + + Quantizes tensor A by dividing it into blocks which are independently quantized to FP4. + + Parameters + ---------- + A : torch.Tensor + The input tensor. + absmax : torch.Tensor + The absmax values. + out : torch.Tensor + The output tensor (8-bit). + blocksize : int + The blocksize used in quantization. + quant_type : str + The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now + + Returns + ------- + torch.Tensor: + The 8-bit tensor with packed 4-bit values. + tuple(torch.Tensor, torch.Size, torch.dtype, int): + The quantization state to undo the quantization. + """ + if quant_type not in ["nf4", "fp4"]: + raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU.") + if quant_type == "fp4": + warnings.warn("fp4 quantization is currently slow on CPU/XPU. 
Please Use nf4 instead for better performance.") + assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] + n = A.numel() + input_shape = A.shape + blocks = n // blocksize + blocks += 1 if n % blocksize > 0 else 0 + + if absmax is None: + absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype) + + if out is None: + out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device) + + rem = n % blocksize + has_rem = rem > 0 + + # Scale tensor to [-1, 1] + A_reshaped = A.reshape(n) + A_com = A_reshaped[: n - rem] + A_com_reshaped = A_com.reshape(n // blocksize, blocksize) + absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0] + scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[: blocks - has_rem].view(-1, 1)), -1, 1) + scaled_A = scaled_A.reshape(-1) + if has_rem: + absmax[-1] = torch.abs(A_reshaped[n - rem :]).max() + scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1) + scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0) + # map [-1, 1] to nf4/fp4 + out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8) + if quant_type == "nf4": + for i in range(len(NF4_QUANT_TABLE)): + out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i + elif quant_type == "fp4": + sign = scaled_A < 0 + abs_scaled_A = torch.abs(scaled_A) + for key, val in FP4_QUANT_TABLE.items(): + out_uint8[abs_scaled_A > key] = val + out_uint8 += sign.to(torch.uint8) * 8 + if out_uint8.size(-1) % 2: + out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0) + out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) + + code = get_4bit_type(quant_type, device=A.device) + + if compress_statistics: + raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") + else: + state = QuantState( + absmax=absmax, + shape=input_shape, + dtype=A.dtype, + blocksize=blocksize, + code=code, + quant_type=quant_type, + ) + + return out.unsqueeze(0), state From 688c3237b1f0a9e7a90b21b86c2da56f0c877eab Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 21:47:13 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- thunder/tests/test_quantization.py | 28 +++++++++++++++++--------- thunder/transforms/quantization.py | 10 ++++----- thunder/transforms/quantization_cpu.py | 20 +++++++++--------- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/thunder/tests/test_quantization.py b/thunder/tests/test_quantization.py index 832d9b0e1d..d94a5c76fd 100644 --- a/thunder/tests/test_quantization.py +++ b/thunder/tests/test_quantization.py @@ -2,12 +2,13 @@ import time from thunder.transforms.quantization import BitsAndBytesLinearQuant4bit + def test_cpu_quantization(): # Initialize quantization transform quant_transform = BitsAndBytesLinearQuant4bit() # Create a tensor on CPU - weight = torch.randn(3, 3, device='cpu') + weight = torch.randn(3, 3, device="cpu") # Quantize weight (expect only the quantized tensor, not a tuple) quantized_weight = quant_transform.quantize_weight(weight) @@ -15,9 +16,12 @@ def test_cpu_quantization(): # Check that the quantized tensor has fewer or equal elements due to compression original_num_elements = weight.numel() quantized_num_elements = quantized_weight.numel() - + assert quantized_weight is not None, "Quantized weight is None" - assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" 
+ assert ( + quantized_num_elements <= original_num_elements + ), "Quantized tensor should have fewer or equal elements due to compression" + def test_gpu_quantization(): if not torch.cuda.is_available(): @@ -27,7 +31,7 @@ def test_gpu_quantization(): quant_transform = BitsAndBytesLinearQuant4bit() # Create a tensor on GPU - weight = torch.randn(3, 3, device='cuda') + weight = torch.randn(3, 3, device="cuda") # Quantize weight (expect only the quantized tensor, not a tuple) quantized_weight = quant_transform.quantize_weight(weight)[0] @@ -37,16 +41,19 @@ def test_gpu_quantization(): quantized_num_elements = quantized_weight.numel() assert quantized_weight is not None, "Quantized weight is None" - assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" + assert ( + quantized_num_elements <= original_num_elements + ), "Quantized tensor should have fewer or equal elements due to compression" + # Optional: Performance tests def measure_time(device_type): quant_transform = BitsAndBytesLinearQuant4bit() - if device_type == 'cuda' and torch.cuda.is_available(): - device = torch.device('cuda') + if device_type == "cuda" and torch.cuda.is_available(): + device = torch.device("cuda") else: - device = torch.device('cpu') + device = torch.device("cpu") weight = torch.randn(1000, 1000, device=device) @@ -56,6 +63,7 @@ def measure_time(device_type): print(f"Quantization time on {device_type}: {end_time - start_time:.4f} seconds") + # Run functional tests print("Testing CPU quantization:") test_cpu_quantization() @@ -68,6 +76,6 @@ def measure_time(device_type): # Run performance tests print("\nMeasuring performance:") -measure_time('cpu') +measure_time("cpu") if torch.cuda.is_available(): - measure_time('cuda') + measure_time("cuda") diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index 31d0ab977e..c07508583f 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -95,16 +95,16 @@ def quantize_weight(self, w): if w.device.type == "meta": num_elements = w.numel() return torch.empty((num_elements, 1), device="meta", dtype=torch.uint8) - + # CPU quantization without returning the quantization state. - # Currently, the quantization state is omitted for CPU as the primary goal is to optimize - # for inference. If the use case involves fine-tuning or dequantizing weights back to + # Currently, the quantization state is omitted for CPU as the primary goal is to optimize + # for inference. If the use case involves fine-tuning or dequantizing weights back to # their original precision, it may be necessary to return the state. This can be revisited - # if future use cases require more flexibility, such as further model training or analysis + # if future use cases require more flexibility, such as further model training or analysis # of quantization effects on the CPU. 
if w.device.type == "cpu": return quantize_4bit_impl(w, quant_type="nf4")[0] - + if w.device.type != "cuda": with torch.no_grad(): w_work = w.to("cuda") diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index a3a7d1883e..2d19d7454c 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -1,7 +1,7 @@ -# NOTE: The code for CPU quantization in this file has been adapted from a not-yet-merged branch of the +# NOTE: The code for CPU quantization in this file has been adapted from a not-yet-merged branch of the # bitsandbytes library (https://github.com/bitsandbytes-foundation/bitsandbytes/tree/multi-backend-refactor). -# Once the changes in that branch are merged into the main bitsandbytes repository, this implementation -# should be replaced with the official, upstream version to ensure better compatibility, performance, +# Once the changes in that branch are merged into the main bitsandbytes repository, this implementation +# should be replaced with the official, upstream version to ensure better compatibility, performance, # and future updates. # Please track the progress of the bitsandbytes library and update this file when necessary. @@ -46,20 +46,21 @@ 0.8333333: 3, # 0b0011 } + def get_4bit_type(typename, device=None, blocksize=64): if device is None: device = "cuda" data = None if typename == "nf4": - """ Implements the NF4 data type. + """Implements the NF4 data type. - Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that - is normalized into the range [-1, 1]. + Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that + is normalized into the range [-1, 1]. - For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) + For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) - Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in - the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. + Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in + the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. 
""" data = [ -1.0, @@ -127,6 +128,7 @@ def get_4bit_type(typename, device=None, blocksize=64): return data + def quantize_4bit_impl( A: Tensor, absmax: Tensor = None, From d3bcbf9275c0f8d8b6d7711607bca8bdba522d55 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Tue, 8 Oct 2024 19:10:23 +0200 Subject: [PATCH 3/9] quantize_weight update for meta andd cpu --- thunder/tests/test_quantization.py | 73 ---------- thunder/transforms/quantization.py | 34 +++-- thunder/transforms/quantization_cpu.py | 181 +++++++++++++------------ 3 files changed, 116 insertions(+), 172 deletions(-) delete mode 100644 thunder/tests/test_quantization.py diff --git a/thunder/tests/test_quantization.py b/thunder/tests/test_quantization.py deleted file mode 100644 index 832d9b0e1d..0000000000 --- a/thunder/tests/test_quantization.py +++ /dev/null @@ -1,73 +0,0 @@ -import torch -import time -from thunder.transforms.quantization import BitsAndBytesLinearQuant4bit - -def test_cpu_quantization(): - # Initialize quantization transform - quant_transform = BitsAndBytesLinearQuant4bit() - - # Create a tensor on CPU - weight = torch.randn(3, 3, device='cpu') - - # Quantize weight (expect only the quantized tensor, not a tuple) - quantized_weight = quant_transform.quantize_weight(weight) - - # Check that the quantized tensor has fewer or equal elements due to compression - original_num_elements = weight.numel() - quantized_num_elements = quantized_weight.numel() - - assert quantized_weight is not None, "Quantized weight is None" - assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" - -def test_gpu_quantization(): - if not torch.cuda.is_available(): - return - - # Initialize quantization transform - quant_transform = BitsAndBytesLinearQuant4bit() - - # Create a tensor on GPU - weight = torch.randn(3, 3, device='cuda') - - # Quantize weight (expect only the quantized tensor, not a tuple) - quantized_weight = quant_transform.quantize_weight(weight)[0] - - # Check that the quantized tensor has fewer or equal elements due to compression - original_num_elements = weight.numel() - quantized_num_elements = quantized_weight.numel() - - assert quantized_weight is not None, "Quantized weight is None" - assert quantized_num_elements <= original_num_elements, "Quantized tensor should have fewer or equal elements due to compression" - -# Optional: Performance tests -def measure_time(device_type): - quant_transform = BitsAndBytesLinearQuant4bit() - - if device_type == 'cuda' and torch.cuda.is_available(): - device = torch.device('cuda') - else: - device = torch.device('cpu') - - weight = torch.randn(1000, 1000, device=device) - - start_time = time.time() - quantized_weight = quant_transform.quantize_weight(weight) # Expect only the quantized tensor - end_time = time.time() - - print(f"Quantization time on {device_type}: {end_time - start_time:.4f} seconds") - -# Run functional tests -print("Testing CPU quantization:") -test_cpu_quantization() - -if torch.cuda.is_available(): - print("\nTesting GPU quantization:") - test_gpu_quantization() -else: - print("\nGPU not available, skipping GPU test.") - -# Run performance tests -print("\nMeasuring performance:") -measure_time('cpu') -if torch.cuda.is_available(): - measure_time('cuda') diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index 902d44d41e..ff9e502554 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -14,7 +14,8 @@ 
trace_with_replaced_proxy_metadata, ) -from .quantization_cpu import quantize_4bit_impl +from .quantization_cpu import quantize_4bit_cpu +from bitsandbytes.functional import QuantState, get_4bit_type bitsandbytes_executor = None @@ -55,17 +56,28 @@ def __init__(self): def quantize_weight(self, w): if w.device.type == "meta": - num_elements = w.numel() - return torch.empty((num_elements, 1), device="meta", dtype=torch.uint8) - - # CPU quantization without returning the quantization state. - # Currently, the quantization state is omitted for CPU as the primary goal is to optimize - # for inference. If the use case involves fine-tuning or dequantizing weights back to - # their original precision, it may be necessary to return the state. This can be revisited - # if future use cases require more flexibility, such as further model training or analysis - # of quantization effects on the CPU. + n = w.numel() + output_shape = ((n + 1) // 2,) + blocksize=64 + blocks = n // blocksize + blocks += 1 if n % blocksize > 0 else 0 + absmax = torch.zeros((blocks,), device=w.device, dtype=w.dtype) + quant_type="nf4" + code = get_4bit_type(quant_type, device=w.device) + + # Return only shape and dtype for meta tensors without calculation + state = QuantState( + absmax=absmax, + shape=w.shape, + dtype=w.dtype, + blocksize=64, + code=code, + quant_type=quant_type, + ) + return torch.empty(output_shape, device="meta", dtype=torch.uint8), state + if w.device.type == "cpu": - return quantize_4bit_impl(w, quant_type="nf4")[0] + return quantize_4bit_cpu(w, quant_type="nf4") if w.device.type != "cuda": with torch.no_grad(): diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index a3a7d1883e..3b31f0c5ac 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -1,12 +1,37 @@ -# NOTE: The code for CPU quantization in this file has been adapted from a not-yet-merged branch of the -# bitsandbytes library (https://github.com/bitsandbytes-foundation/bitsandbytes/tree/multi-backend-refactor). -# Once the changes in that branch are merged into the main bitsandbytes repository, this implementation -# should be replaced with the official, upstream version to ensure better compatibility, performance, -# and future updates. -# Please track the progress of the bitsandbytes library and update this file when necessary. - +""" +Derivied from + https://github.com/bitsandbytes-foundation/bitsandbytes + +The code for CPU quantization in this file has been adapted from a not-yet-merged +multi-backend-refactor branch + +MIT License: + https://github.com/bitsandbytes-foundation/bitsandbytes/blob/main/LICENSE + +Copyright (c) Facebook, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +import subprocess +from typing import Literal, Optional, Tuple import warnings - import torch from bitsandbytes.functional import ( @@ -14,8 +39,40 @@ get_4bit_type, ) +try: + # to support Intel CPU/GPU (XPU) backend + import intel_extension_for_pytorch as ipex + + ipex_cpu = ipex if ipex._C._has_cpu() else None + ipex_xpu = ipex if ipex._C._has_xpu() else None +except BaseException: + ipex_cpu = None + ipex_xpu = None + +gxx_available = False +try: + subprocess.run(["g++", "--version"]) + gxx_available = True +except BaseException: + warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.") + Tensor = torch.Tensor +def _torch_version_prereq(major, minor): + ver_major = int(torch.__version__.split(".")[0]) + ver_minor = int(torch.__version__.split(".")[1]) + return ver_major * 32 + ver_minor >= major * 32 + minor + +def _maybe_torch_compile(func): + # torch.compile requires g++ and pytorch >= 2.0 + if gxx_available and _torch_version_prereq(2, 0): + options = {} + # fx_graph_cache requires pytorch >= 2.2 + if _torch_version_prereq(2, 2): + options.update({"fx_graph_cache": True}) + return torch.compile(func, dynamic=True, options=options) + return func + NF4_QUANT_TABLE = [ -1.0 - 1e-2, # 0b0000 -0.8480964004993439, # 0b0001 @@ -46,87 +103,35 @@ 0.8333333: 3, # 0b0011 } -def get_4bit_type(typename, device=None, blocksize=64): - if device is None: - device = "cuda" - data = None - if typename == "nf4": - """ Implements the NF4 data type. - - Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that - is normalized into the range [-1, 1]. - - For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) - - Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in - the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. 
- """ - data = [ - -1.0, - -0.6961928009986877, - -0.5250730514526367, - -0.39491748809814453, - -0.28444138169288635, - -0.18477343022823334, - -0.09105003625154495, - 0.0, - 0.07958029955625534, - 0.16093020141124725, - 0.24611230194568634, - 0.33791524171829224, - 0.44070982933044434, - 0.5626170039176941, - 0.7229568362236023, - 1.0, - ] - elif typename == "fp4": - # 0b000 = 0 - # 0b001 = 0.0625 - # 0b010 = 8 - # 0b011 = 12 - # 0b100 = 4 - # 0b101 = 6 - # 0b110 = 2 - # 0b111 = 3 - # can also be created with bnb.functional.create_fp8_map(signed=True, exponent_bits=2, precision_bits=1, total_bits=4) - data = [0, 0.0625, 8.0, 12.0, 4.0, 6.0, 2.0, 3.0, -0, -0.0625, -8.0, -12.0, -4.0, -6.0, -2.0, -3.0] - elif typename == "int4": - data = [7, 6, 5, 4, 3, 2, 1, 0, -0, -1, -2, -3, -4, -5, -6, -7] - elif typename == "af4": - # Taken from: NF4 Isn't Information Theoretically Optimal (and that's Good) - # https://arxiv.org/abs/2306.06965 - if blocksize == 64: - data = [ - -1.0, - -0.69441008, - -0.51243739, - -0.3736951, - -0.25607552, - -0.14982478, - -0.04934812, - 0.0, - 0.04273164, - 0.12934483, - 0.21961274, - 0.31675666, - 0.42563882, - 0.55496234, - 0.72424863, - 1.0, - ][::-1] - else: - raise NotImplementedError("4-bit AbnormalFloats currently only support blocksize 64.") - - if data is None: - raise NotImplementedError(f"Typename {typename} not supported") - - data = torch.tensor(data, device=device) - data.div_(data.abs().max()) - - assert data.numel() == 16 - - return data +def assert_on_cpu(tensors): + on_cpu = True + for t in tensors: + if t is None: + continue # NULL pointers are fine + on_cpu &= t.device.type == "cpu" + if not on_cpu: + raise TypeError( + "All input tensors need to be on CPU, but found some tensors to not be on CPU:\n" + f" {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}" + ) + return on_cpu +def quantize_4bit_cpu( + A: torch.Tensor, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=64, + compress_statistics=False, + quant_type: Literal["fp4", "nf4"] = "fp4", + quant_storage=torch.uint8, +) -> Tuple[torch.Tensor, QuantState]: + if blocksize is None: + blocksize = 64 + assert_on_cpu([A, absmax, out]) + assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage" + return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type) + +@_maybe_torch_compile def quantize_4bit_impl( A: Tensor, absmax: Tensor = None, From cdb1116b7c9a25531512ef2df7d39fc6702ae7ee Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Tue, 8 Oct 2024 19:19:50 +0200 Subject: [PATCH 4/9] update --- thunder/tests/test_quantization.py | 81 ------------------------------ 1 file changed, 81 deletions(-) delete mode 100644 thunder/tests/test_quantization.py diff --git a/thunder/tests/test_quantization.py b/thunder/tests/test_quantization.py deleted file mode 100644 index d94a5c76fd..0000000000 --- a/thunder/tests/test_quantization.py +++ /dev/null @@ -1,81 +0,0 @@ -import torch -import time -from thunder.transforms.quantization import BitsAndBytesLinearQuant4bit - - -def test_cpu_quantization(): - # Initialize quantization transform - quant_transform = BitsAndBytesLinearQuant4bit() - - # Create a tensor on CPU - weight = torch.randn(3, 3, device="cpu") - - # Quantize weight (expect only the quantized tensor, not a tuple) - quantized_weight = quant_transform.quantize_weight(weight) - - # Check that the quantized tensor has fewer or equal elements due to compression - original_num_elements = 
weight.numel() - quantized_num_elements = quantized_weight.numel() - - assert quantized_weight is not None, "Quantized weight is None" - assert ( - quantized_num_elements <= original_num_elements - ), "Quantized tensor should have fewer or equal elements due to compression" - - -def test_gpu_quantization(): - if not torch.cuda.is_available(): - return - - # Initialize quantization transform - quant_transform = BitsAndBytesLinearQuant4bit() - - # Create a tensor on GPU - weight = torch.randn(3, 3, device="cuda") - - # Quantize weight (expect only the quantized tensor, not a tuple) - quantized_weight = quant_transform.quantize_weight(weight)[0] - - # Check that the quantized tensor has fewer or equal elements due to compression - original_num_elements = weight.numel() - quantized_num_elements = quantized_weight.numel() - - assert quantized_weight is not None, "Quantized weight is None" - assert ( - quantized_num_elements <= original_num_elements - ), "Quantized tensor should have fewer or equal elements due to compression" - - -# Optional: Performance tests -def measure_time(device_type): - quant_transform = BitsAndBytesLinearQuant4bit() - - if device_type == "cuda" and torch.cuda.is_available(): - device = torch.device("cuda") - else: - device = torch.device("cpu") - - weight = torch.randn(1000, 1000, device=device) - - start_time = time.time() - quantized_weight = quant_transform.quantize_weight(weight) # Expect only the quantized tensor - end_time = time.time() - - print(f"Quantization time on {device_type}: {end_time - start_time:.4f} seconds") - - -# Run functional tests -print("Testing CPU quantization:") -test_cpu_quantization() - -if torch.cuda.is_available(): - print("\nTesting GPU quantization:") - test_gpu_quantization() -else: - print("\nGPU not available, skipping GPU test.") - -# Run performance tests -print("\nMeasuring performance:") -measure_time("cpu") -if torch.cuda.is_available(): - measure_time("cuda") From d0b683fee9af2c039e9828d89214392f225388f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 21:49:34 +0000 Subject: [PATCH 5/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- thunder/transforms/quantization.py | 6 +++--- thunder/transforms/quantization_cpu.py | 14 ++++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index ff9e502554..c5ab06ea0b 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -58,11 +58,11 @@ def quantize_weight(self, w): if w.device.type == "meta": n = w.numel() output_shape = ((n + 1) // 2,) - blocksize=64 + blocksize = 64 blocks = n // blocksize blocks += 1 if n % blocksize > 0 else 0 absmax = torch.zeros((blocks,), device=w.device, dtype=w.dtype) - quant_type="nf4" + quant_type = "nf4" code = get_4bit_type(quant_type, device=w.device) # Return only shape and dtype for meta tensors without calculation @@ -78,7 +78,7 @@ def quantize_weight(self, w): if w.device.type == "cpu": return quantize_4bit_cpu(w, quant_type="nf4") - + if w.device.type != "cuda": with torch.no_grad(): w_work = w.to("cuda") diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index 3b31f0c5ac..0e8e84837d 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -4,7 +4,7 @@ The code for CPU quantization in this file has been 
adapted from a not-yet-merged multi-backend-refactor branch - + MIT License: https://github.com/bitsandbytes-foundation/bitsandbytes/blob/main/LICENSE @@ -58,11 +58,13 @@ Tensor = torch.Tensor + def _torch_version_prereq(major, minor): ver_major = int(torch.__version__.split(".")[0]) ver_minor = int(torch.__version__.split(".")[1]) return ver_major * 32 + ver_minor >= major * 32 + minor + def _maybe_torch_compile(func): # torch.compile requires g++ and pytorch >= 2.0 if gxx_available and _torch_version_prereq(2, 0): @@ -73,6 +75,7 @@ def _maybe_torch_compile(func): return torch.compile(func, dynamic=True, options=options) return func + NF4_QUANT_TABLE = [ -1.0 - 1e-2, # 0b0000 -0.8480964004993439, # 0b0001 @@ -103,6 +106,7 @@ def _maybe_torch_compile(func): 0.8333333: 3, # 0b0011 } + def assert_on_cpu(tensors): on_cpu = True for t in tensors: @@ -116,21 +120,23 @@ def assert_on_cpu(tensors): ) return on_cpu + def quantize_4bit_cpu( A: torch.Tensor, - absmax: Optional[torch.Tensor] = None, - out: Optional[torch.Tensor] = None, + absmax: torch.Tensor | None = None, + out: torch.Tensor | None = None, blocksize=64, compress_statistics=False, quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, -) -> Tuple[torch.Tensor, QuantState]: +) -> tuple[torch.Tensor, QuantState]: if blocksize is None: blocksize = 64 assert_on_cpu([A, absmax, out]) assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage" return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type) + @_maybe_torch_compile def quantize_4bit_impl( A: Tensor, From 63fc6d4b893685a499495e60128e80b3fb3bc930 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Sat, 12 Oct 2024 04:03:24 +0200 Subject: [PATCH 6/9] meta and cpu shape update --- thunder/transforms/quantization.py | 2 +- thunder/transforms/quantization_cpu.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index c5ab06ea0b..234244c044 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -57,7 +57,7 @@ def __init__(self): def quantize_weight(self, w): if w.device.type == "meta": n = w.numel() - output_shape = ((n + 1) // 2,) + output_shape = ((n + 1) // 2, 1) blocksize = 64 blocks = n // blocksize blocks += 1 if n % blocksize > 0 else 0 diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index 0e8e84837d..b8df98a46f 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -185,7 +185,9 @@ def quantize_4bit_impl( absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype) if out is None: - out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device) + # change to 2D shape instead of unsqueeze(0) to be consistent with + # CUDA implementation in multi-backend-refactor branch + out = torch.zeros(((n + 1) // 2, 1), dtype=torch.uint8, device=A.device) rem = n % blocksize has_rem = rem > 0 @@ -230,4 +232,4 @@ def quantize_4bit_impl( quant_type=quant_type, ) - return out.unsqueeze(0), state + return out, state From b4e7daee697d4529beaa75c4d69d686730e58956 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Mon, 21 Oct 2024 12:20:05 +0200 Subject: [PATCH 7/9] META, CPU and CUDA consistency changes --- thunder/transforms/quantization.py | 3 ++- thunder/transforms/quantization_cpu.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/thunder/transforms/quantization.py 
b/thunder/transforms/quantization.py index 234244c044..a90e8a3c51 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -61,7 +61,8 @@ def quantize_weight(self, w): blocksize = 64 blocks = n // blocksize blocks += 1 if n % blocksize > 0 else 0 - absmax = torch.zeros((blocks,), device=w.device, dtype=w.dtype) + # cuda absmax dtype is torch.float32 instead of dtype=A.dtype + absmax = torch.zeros((blocks,), device=w.device, dtype=torch.float32) quant_type = "nf4" code = get_4bit_type(quant_type, device=w.device) diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index b8df98a46f..a5f1fabfb8 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -182,7 +182,8 @@ def quantize_4bit_impl( blocks += 1 if n % blocksize > 0 else 0 if absmax is None: - absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype) + # cuda absmax dtype is torch.float32 instead of dtype=A.dtype + absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32) if out is None: # change to 2D shape instead of unsqueeze(0) to be consistent with @@ -216,7 +217,14 @@ def quantize_4bit_impl( out_uint8 += sign.to(torch.uint8) * 8 if out_uint8.size(-1) % 2: out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0) - out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) + + # Perform the bitwise operations + result = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) + + # Reshape the result to a 2D tensor with shape [N, 1] + # CUDA out is 2D tensor + out[:] = result.view(-1, 1) + # out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) code = get_4bit_type(quant_type, device=A.device) From 4b15a72cd41bec180b39a4215551b124c8950713 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Sun, 3 Nov 2024 20:06:25 +0100 Subject: [PATCH 8/9] bitsandbytes import update --- thunder/transforms/quantization.py | 5 ++--- thunder/transforms/quantization_cpu.py | 11 ++++------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/thunder/transforms/quantization.py b/thunder/transforms/quantization.py index a90e8a3c51..647945fa5c 100644 --- a/thunder/transforms/quantization.py +++ b/thunder/transforms/quantization.py @@ -15,7 +15,6 @@ ) from .quantization_cpu import quantize_4bit_cpu -from bitsandbytes.functional import QuantState, get_4bit_type bitsandbytes_executor = None @@ -64,10 +63,10 @@ def quantize_weight(self, w): # cuda absmax dtype is torch.float32 instead of dtype=A.dtype absmax = torch.zeros((blocks,), device=w.device, dtype=torch.float32) quant_type = "nf4" - code = get_4bit_type(quant_type, device=w.device) + code = bitsandbytes.functional.get_4bit_type(quant_type, device=w.device) # Return only shape and dtype for meta tensors without calculation - state = QuantState( + state = bitsandbytes.functional.QuantState( absmax=absmax, shape=w.shape, dtype=w.dtype, diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index a5f1fabfb8..33f0b0b671 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -34,10 +34,7 @@ import warnings import torch -from bitsandbytes.functional import ( - QuantState, - get_4bit_type, -) +import bitsandbytes try: # to support Intel CPU/GPU (XPU) backend @@ -129,7 +126,7 @@ def quantize_4bit_cpu( compress_statistics=False, quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, -) -> tuple[torch.Tensor, 
QuantState]: +) -> tuple[torch.Tensor, bitsandbytes.functional.QuantState]: if blocksize is None: blocksize = 64 assert_on_cpu([A, absmax, out]) @@ -226,12 +223,12 @@ def quantize_4bit_impl( out[:] = result.view(-1, 1) # out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) - code = get_4bit_type(quant_type, device=A.device) + code = bitsandbytes.functional.get_4bit_type(quant_type, device=A.device) if compress_statistics: raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") else: - state = QuantState( + state = bitsandbytes.functional.QuantState( absmax=absmax, shape=input_shape, dtype=A.dtype, From dbca6e917a59c9ae2bc4f487f74c62a2a3254f11 Mon Sep 17 00:00:00 2001 From: Tomasz Bawor Date: Sun, 3 Nov 2024 20:35:03 +0100 Subject: [PATCH 9/9] update --- thunder/transforms/quantization_cpu.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/thunder/transforms/quantization_cpu.py b/thunder/transforms/quantization_cpu.py index 33f0b0b671..a5f1fabfb8 100644 --- a/thunder/transforms/quantization_cpu.py +++ b/thunder/transforms/quantization_cpu.py @@ -34,7 +34,10 @@ import warnings import torch -import bitsandbytes +from bitsandbytes.functional import ( + QuantState, + get_4bit_type, +) try: # to support Intel CPU/GPU (XPU) backend @@ -126,7 +129,7 @@ def quantize_4bit_cpu( compress_statistics=False, quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, -) -> tuple[torch.Tensor, bitsandbytes.functional.QuantState]: +) -> tuple[torch.Tensor, QuantState]: if blocksize is None: blocksize = 64 assert_on_cpu([A, absmax, out]) @@ -223,12 +226,12 @@ def quantize_4bit_impl( out[:] = result.view(-1, 1) # out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) - code = bitsandbytes.functional.get_4bit_type(quant_type, device=A.device) + code = get_4bit_type(quant_type, device=A.device) if compress_statistics: raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") else: - state = bitsandbytes.functional.QuantState( + state = QuantState( absmax=absmax, shape=input_shape, dtype=A.dtype,
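
For reference, the packed nf4 layout that quantize_4bit_impl produces (two 4-bit codes per uint8, low nibble for even-indexed elements, high nibble for odd-indexed ones, plus the per-block absmax and 16-entry code table carried in the returned QuantState) can be inverted with a few tensor ops. The sketch below is illustrative only and is not part of the patches; dequantize_nf4_cpu_sketch is a hypothetical helper name, and it assumes the final state of the series (2-D ((n + 1) // 2, 1) uint8 output, float32 absmax, blocksize 64).

import torch

def dequantize_nf4_cpu_sketch(packed: torch.Tensor, state) -> torch.Tensor:
    # Hypothetical helper (not in the patches): inverts quantize_4bit_impl for
    # quant_type="nf4", using only the fields stored in the returned QuantState.
    n = state.shape.numel()                       # number of elements in the original weight
    flat = packed.reshape(-1)                     # one uint8 holds two 4-bit codes
    low = flat & 0x0F                             # codes of even-indexed elements
    high = flat >> 4                              # codes of odd-indexed elements
    codes = torch.stack((low, high), dim=-1).reshape(-1)[:n].long()
    values = state.code.to(torch.float32)[codes]  # look up the 16-entry nf4 table
    # Undo the per-block scaling: pad to whole blocks, multiply by the per-block absmax.
    blocks = state.absmax.numel()
    padded = torch.zeros(blocks * state.blocksize, dtype=torch.float32)
    padded[:n] = values
    padded = padded.reshape(blocks, state.blocksize) * state.absmax.reshape(-1, 1).to(torch.float32)
    return padded.reshape(-1)[:n].reshape(state.shape).to(state.dtype)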
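
A minimal round-trip check of the new CPU path, in the spirit of the test file added in patch 1 and later removed, could then look as follows. This is a sketch under the assumption that bitsandbytes imports cleanly on a CPU-only machine; dequantize_nf4_cpu_sketch is the hypothetical helper above, and since blockwise 4-bit quantization is lossy, only an approximate reconstruction is expected.

import torch
from thunder.transforms.quantization_cpu import quantize_4bit_cpu

weight = torch.randn(128, 64)  # CPU tensor, 8192 elements -> 4096 packed bytes
packed, state = quantize_4bit_cpu(weight, quant_type="nf4")

assert packed.dtype == torch.uint8
assert packed.shape == ((weight.numel() + 1) // 2, 1)  # 2-D, consistent with the CUDA layout
assert state.absmax.dtype == torch.float32 and state.blocksize == 64

restored = dequantize_nf4_cpu_sketch(packed, state)
assert restored.shape == weight.shape
print((restored - weight).abs().max())  # small but nonzero reconstruction error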