From 666de99fa6f33ae777d98822d4be7121eab2388a Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Thu, 13 Jun 2024 21:41:37 +0000
Subject: [PATCH 1/7] add channel support for per token

---
 .../quantization/compressed_tensors/compressed_tensors.py | 8 +++-----
 .../schemes/compressed_tensors_w8a8_dynamictoken.py       | 5 ++++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index c7f04784591b2..96dc752ac6350 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -88,10 +88,8 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
     def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-        is_token_tensor = (weight_quant.strategy
-                           == QuantizationStrategy.TENSOR.value) and (
-                               input_quant.strategy
-                               == QuantizationStrategy.TOKEN.value)
+        weight_strategy = weight_quant.strategy == QuantizationStrategy.TENSOR.value or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        is_token_tensor = weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
@@ -118,7 +116,7 @@ def _get_schema(self, weight_quant: BaseModel,
             return CompressedTensorsW8A8StaticTensor()
 
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-            return CompressedTensorsW8A8DynamicToken()
+            return CompressedTensorsW8A8DynamicToken(strategy=weight_quant.strategy)
 
         raise NotImplementedError("Scheme not supported.")
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index 9bb7bf4470872..eed57c682ec1d 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -12,6 +12,8 @@
 
 class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
 
+    def __init__(self, strategy: str):
+        self.strategy = strategy
 
     def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
         if isinstance(shard_id, int):
@@ -45,8 +47,9 @@ def create_weights(self, layer: torch.nn.Module,
         # CompressedTensorsW8A8StaticTensor::create_weights for further
         # information.
         is_tensor_partitioned = len(output_partition_sizes) != 1
+        # TODO: if strategy: channel this should always be weight_scale_dim
         weight_scale_dim = sum(
-            output_partition_sizes) if is_tensor_partitioned else 1
+            output_partition_sizes) if (is_tensor_partitioned or self.strategy == "CHANNEL") else 1
 
         weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
                                       requires_grad=False)
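Patch 1 widens scheme detection so the weight strategy may be per-tensor or per-channel while activations stay per-token, and threads that strategy into the scheme object. For readers new to the distinction, here is a minimal sketch of the two weight granularities -- plain PyTorch, not vLLM's API, with illustrative names:

    import torch

    def quantize_weight_int8(w: torch.Tensor, per_channel: bool):
        """Symmetric int8 weight quantization for w of shape (N, K)."""
        if per_channel:
            # one scale per output channel (row): shape (N, 1)
            scale = w.abs().amax(dim=1, keepdim=True) / 127.0
        else:
            # one scale for the whole tensor: shape (1,)
            scale = w.abs().amax().reshape(1) / 127.0
        w_q = torch.clamp((w / scale).round(), -128, 127).to(torch.int8)
        return w_q, scale

    w = torch.randn(8, 16)
    print(quantize_weight_int8(w, per_channel=False)[1].shape)  # torch.Size([1])
    print(quantize_weight_int8(w, per_channel=True)[1].shape)   # torch.Size([8, 1])

Per-channel scales cost N floats instead of one, but they decouple the quantization range of each output channel, which is why the scheme wants the option.
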
From f3877468c9e56fcfa01fdbe93c3a0e79b3f17922 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 14 Jun 2024 02:37:57 +0000
Subject: [PATCH 2/7] update

---
 vllm/model_executor/layers/linear.py                |  2 ++
 .../compressed_tensors_w8a8_dynamictoken.py         | 33 ++++++++++++++-----
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index f5b6bdd9f7fd7..7cffd68e90528 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -475,6 +475,7 @@ def weight_loader(self,
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
+        print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -786,6 +787,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
+        print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index eed57c682ec1d..4f43b00028e02 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -49,14 +49,20 @@ def create_weights(self, layer: torch.nn.Module,
         is_tensor_partitioned = len(output_partition_sizes) != 1
         # TODO: if strategy: channel this should always be weight_scale_dim
         weight_scale_dim = sum(
-            output_partition_sizes) if (is_tensor_partitioned or self.strategy == "CHANNEL") else 1
+            output_partition_sizes) if (is_tensor_partitioned or self.strategy.value == "channel") else 1
 
         weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
                                       requires_grad=False)
 
-        weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                             dtype=torch.float32),
-                                 requires_grad=False)
+        # Can we add the extra dim for the per tensor case so the shapes are the same?
+        if self.strategy.value == "channel":
+            weight_scale = Parameter(torch.empty(weight_scale_dim, 1,
+                                                 dtype=torch.float32),
+                                     requires_grad=False)
+        else:
+            weight_scale = Parameter(torch.empty(weight_scale_dim,
+                                                 dtype=torch.float32),
+                                     requires_grad=False)
 
         weight = Parameter(torch.empty(sum(output_partition_sizes),
                                        input_size_per_partition,
@@ -70,11 +76,20 @@ def create_weights(self, layer: torch.nn.Module,
 
         layer.register_parameter("weight_scale", weight_scale)
         set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
-        set_weight_attrs(
-            weight_scale, {
-                "shard_splitter": self.scales_shard_splitter,
-                "logical_widths": output_partition_sizes
-            })
+
+        if self.strategy.value == "channel":
+            set_weight_attrs(
+                weight_scale, {
+                    "output_dim": 0,
+                })
+
+        # Shouldn't need the shard_splitter if using channel-wise. Confirm this all loads
+        if self.strategy.value != "channel":
+            set_weight_attrs(
+                weight_scale, {
+                    "logical_widths": output_partition_sizes,
+                    "shard_splitter": self.scales_shard_splitter,
+                })
 
         layer.register_parameter("weight_zero_point", weight_zero_point)
         set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader})
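The substantive move in patch 2: channel-wise weight scales become a 2-D (num_channels, 1) parameter tagged with an "output_dim" attribute, so the stock row-sharded weight loader can slice them exactly like the int8 weight they belong to, while per-tensor scales keep the custom shard splitter. A rough sketch of why output_dim alone suffices for the channel case -- a hypothetical loader, not the real one in linear.py:

    import torch

    def load_output_sharded(param: torch.Tensor, full: torch.Tensor,
                            output_dim: int, tp_rank: int, tp_size: int):
        """Copy this rank's slice of a checkpoint tensor along output_dim."""
        shard = full.shape[output_dim] // tp_size
        param.copy_(full.narrow(output_dim, tp_rank * shard, shard))

    # A (16, 1) channel scale shards across 2 ranks just like its (16, K)
    # weight would -- no bespoke splitter needed.
    full_scale = torch.rand(16, 1)
    param = torch.empty(8, 1)
    load_output_sharded(param, full_scale, output_dim=0, tp_rank=1, tp_size=2)
    assert torch.equal(param, full_scale[8:])
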
From fd43792873923beba69ed08055dc7201b10b8270 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 14 Jun 2024 13:50:10 +0000
Subject: [PATCH 3/7] format; cleanup

---
 vllm/model_executor/layers/linear.py                | 16 +-------
 .../compressed_tensors/compressed_tensors.py        | 10 +++--
 .../compressed_tensors_w8a8_dynamictoken.py         | 39 +++++++++----------
 3 files changed, 27 insertions(+), 38 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 7cffd68e90528..8518485572539 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -468,14 +468,6 @@ def weight_loader(self,
                     "MergedColumnParallelLinear, assume the weight is "
                     "the same for all partitions.")
 
-        if fp8_scales_shard_indexer is None:
-            if len(param_data.shape) == 0:
-                param_data = param_data.reshape(1)
-
-            if len(loaded_weight.shape) == 0:
-                loaded_weight = loaded_weight.reshape(1)
-
-        print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -687,12 +679,6 @@ def weight_loader(self,
                     "QKVParallelLinear, assume the weight is the same "
                     "for all partitions.")
 
-        if len(param_data.shape) == 0:
-            param_data = param_data.reshape(1)
-
-        if len(loaded_weight.shape) == 0:
-            loaded_weight = loaded_weight.reshape(1)
-
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -785,7 +771,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
                                                           shard_id=0)
 
         if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
-            loaded_weight = loaded_weight.reshape(1)
+            loaded_weight = loaded_weight.reshape(1, 1)
 
         print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 96dc752ac6350..1214386afc831 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -88,8 +88,11 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
     def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-        weight_strategy = weight_quant.strategy == QuantizationStrategy.TENSOR.value or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-        is_token_tensor = weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
+        is_token_tensor = (weight_strategy and input_quant.strategy
+                           == QuantizationStrategy.TOKEN.value)
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
@@ -116,7 +119,8 @@ def _get_schema(self, weight_quant: BaseModel,
             return CompressedTensorsW8A8StaticTensor()
 
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-            return CompressedTensorsW8A8DynamicToken(strategy=weight_quant.strategy)
+            return CompressedTensorsW8A8DynamicToken(
+                strategy=weight_quant.strategy)
 
         raise NotImplementedError("Scheme not supported.")
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index 4f43b00028e02..a8d7b89079b21 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -6,12 +6,15 @@
 from vllm import _custom_ops as custom_ops
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.utils import set_weight_attrs
 
 __all__ = ["CompressedTensorsW8A8DynamicToken"]
 
 
 class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
+
     def __init__(self, strategy: str):
         self.strategy = strategy
 
@@ -33,6 +36,9 @@ def scales_shard_splitter(
         size = logical_widths[shard_id]
         # update loaded weight with copies for broadcast.
         loaded_weight = loaded_weight.repeat(size)
+        # parameter defined for scale is 2D; expand
+        if len(loaded_weight.shape) == 1:
+            loaded_weight = torch.unsqueeze(loaded_weight, -1)
         return param[offset:offset + size], loaded_weight
 
     def create_weights(self, layer: torch.nn.Module,
@@ -47,22 +53,18 @@ def create_weights(self, layer: torch.nn.Module,
         # CompressedTensorsW8A8StaticTensor::create_weights for further
         # information.
         is_tensor_partitioned = len(output_partition_sizes) != 1
-        # TODO: if strategy: channel this should always be weight_scale_dim
-        weight_scale_dim = sum(
-            output_partition_sizes) if (is_tensor_partitioned or self.strategy.value == "channel") else 1
+
+        weight_scale_dim = sum(output_partition_sizes) if (
+            is_tensor_partitioned
+            or self.strategy == QuantizationStrategy.CHANNEL) else 1
 
         weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
                                       requires_grad=False)
 
-        # Can we add the extra dim for the per tensor case so the shapes are the same?
-        if self.strategy.value == "channel":
-            weight_scale = Parameter(torch.empty(weight_scale_dim, 1,
-                                                 dtype=torch.float32),
-                                     requires_grad=False)
-        else:
-            weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                                 dtype=torch.float32),
-                                     requires_grad=False)
+        weight_scale = Parameter(torch.empty(weight_scale_dim,
+                                             1,
+                                             dtype=torch.float32),
+                                 requires_grad=False)
 
         weight = Parameter(torch.empty(sum(output_partition_sizes),
                                        input_size_per_partition,
@@ -77,14 +79,11 @@ def create_weights(self, layer: torch.nn.Module,
 
         layer.register_parameter("weight_scale", weight_scale)
         set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
-
-        if self.strategy.value == "channel":
-            set_weight_attrs(
-                weight_scale, {
-                    "output_dim": 0,
-                })
-
-        # Shouldn't need the shard_splitter if using channel-wise. Confirm this all loads
-        if self.strategy.value != "channel":
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            set_weight_attrs(weight_scale, {
+                "output_dim": 0,
+            })
+        else:
             set_weight_attrs(
                 weight_scale, {
                     "logical_widths": output_partition_sizes,
                     "shard_splitter": self.scales_shard_splitter,
                 })
 
         layer.register_parameter("weight_zero_point", weight_zero_point)
         set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader})
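Besides the formatting pass, patch 3 makes the per-tensor path aware of the now 2-D scale parameter: scales_shard_splitter repeats a scalar scale across a merged shard's rows, then unsqueezes the result to (size, 1). A toy version of that broadcast, with invented logical widths standing in for fused q/k/v partitions:

    import torch

    logical_widths = [4, 4, 8]                 # e.g. q, k, v output partitions
    param = torch.zeros(sum(logical_widths), 1)

    def splitter(param, loaded, shard_id):
        offset = sum(logical_widths[:shard_id])
        size = logical_widths[shard_id]
        loaded = loaded.repeat(size)           # scalar scale -> one copy per row
        if loaded.dim() == 1:
            loaded = loaded.unsqueeze(-1)      # (size,) -> (size, 1), matches param
        return param[offset:offset + size], loaded

    dst, src = splitter(param, torch.tensor([0.02]), shard_id=2)
    dst.copy_(src)
    print(param[8:].flatten())                 # eight copies of 0.02
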
From d804d985d1895f97746612031e08a0c64d64a280 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 14 Jun 2024 13:52:25 +0000
Subject: [PATCH 4/7] comments

---
 .../schemes/compressed_tensors_w8a8_dynamictoken.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index a8d7b89079b21..1743c5a599378 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -53,7 +53,8 @@ def create_weights(self, layer: torch.nn.Module,
         # CompressedTensorsW8A8StaticTensor::create_weights for further
         # information.
         is_tensor_partitioned = len(output_partition_sizes) != 1
-
+        # when doing channel-wise quantization, number of scales
+        # is equal to output_dim
         weight_scale_dim = sum(output_partition_sizes) if (
             is_tensor_partitioned
             or self.strategy == QuantizationStrategy.CHANNEL) else 1
@@ -79,6 +80,8 @@ def create_weights(self, layer: torch.nn.Module,
 
         layer.register_parameter("weight_scale", weight_scale)
         set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
+        # Don't need a shard_splitter for channel-wise quantization
+        # Use the default loading method
         if self.strategy == QuantizationStrategy.CHANNEL:
             set_weight_attrs(weight_scale, {
                 "output_dim": 0,
             })
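Patch 4 only adds comments, but they pin down the design: channel scales load through the default output_dim machinery, tensor scales through the splitter. The "dynamic token" half of the scheme's name concerns activations, whose scales are never loaded at all -- they are computed per token at runtime. A minimal sketch of that idea in plain PyTorch (the real path runs inside a vLLM custom op):

    import torch

    def dynamic_per_token_quant(x: torch.Tensor):
        """x: activations of shape (num_tokens, hidden); one scale per row."""
        scale = x.abs().amax(dim=1, keepdim=True) / 127.0    # (num_tokens, 1)
        x_q = torch.clamp((x / scale).round(), -128, 127).to(torch.int8)
        return x_q, scale

    x_q, x_scale = dynamic_per_token_quant(torch.randn(4, 16))
    print(x_q.dtype, x_scale.shape)  # torch.int8 torch.Size([4, 1])
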
From 80de429ab903bf098fad08e6dee8a0ad716ab010 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 14 Jun 2024 14:01:29 +0000
Subject: [PATCH 5/7] update

---
 vllm/model_executor/layers/linear.py                       | 1 -
 .../quantization/compressed_tensors/compressed_tensors.py | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 8518485572539..dbbd52dccd61d 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -773,7 +773,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1, 1)
 
-        print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 1214386afc831..3c95522fe00fa 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -91,12 +91,12 @@ def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.TENSOR.value
             or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
-        is_token_tensor = (weight_strategy and input_quant.strategy
-                           == QuantizationStrategy.TOKEN.value)
+        is_token = (weight_strategy and input_quant.strategy
+                    == QuantizationStrategy.TOKEN.value)
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
-        return is_8_bits and is_token_tensor and is_symmetric and is_dynamic
+        return is_8_bits and is_token and is_symmetric and is_dynamic
 
     def _is_w4a16(self, weight_quant: BaseModel,
                   input_quant: BaseModel) -> bool:
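Patch 5 drops the leftover debug prints and renames is_token_tensor to is_token, which better matches what the predicate now accepts. Restated as a self-contained sketch -- the QuantArgs dataclass is a stand-in for the pydantic config models the real method receives:

    from dataclasses import dataclass

    @dataclass
    class QuantArgs:
        num_bits: int
        strategy: str        # "tensor" | "channel" | "token"
        symmetric: bool
        dynamic: bool

    def is_dynamic_token_w8a8(weight: QuantArgs, act: QuantArgs) -> bool:
        """Static symmetric int8 weights (tensor- or channel-wise) plus
        dynamic symmetric per-token int8 activations."""
        return (weight.num_bits == act.num_bits == 8
                and weight.strategy in ("tensor", "channel")
                and act.strategy == "token"
                and weight.symmetric and act.symmetric
                and not weight.dynamic and act.dynamic)

    print(is_dynamic_token_w8a8(QuantArgs(8, "channel", True, False),
                                QuantArgs(8, "token", True, True)))  # True
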
From 5d59f7ab9d2d7ffc48b7aaa7db50771369593102 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Mon, 17 Jun 2024 18:50:02 +0000
Subject: [PATCH 6/7] fix bug; update test

---
 tests/quantization/test_compressed_tensors.py       | 13 +++++++++----
 vllm/model_executor/layers/linear.py                |  2 +-
 .../compressed_tensors/compressed_tensors.py        |  4 ++--
 .../schemes/compressed_tensors_w8a8_dynamictoken.py | 10 +++++-----
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 611c6b8b7fb9a..b78081155e2ba 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -14,7 +14,7 @@
 
 
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
+    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path, enforce_eager=True) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
@@ -43,15 +43,19 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):
 
 
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
+    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
         sampling_params = SamplingParams()
         output = llm.generate("Hello world!", sampling_params=sampling_params)
         assert output
 
 
-def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
+@pytest.mark.parametrize("model_args", [
+    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
+    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
+])
+def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
+    model_path, strategy = model_args
     with vllm_runner(model_path, dtype=torch.float16) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
@@ -60,6 +64,7 @@
         qkv_proj = layer.self_attn.qkv_proj
 
         assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
         assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
+        assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.weight.dtype is torch.int8
 
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index d29ab35efa50f..45f805547b414 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -771,7 +771,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
                                                           shard_id=0)
 
         if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
-            loaded_weight = loaded_weight.reshape(1, 1)
+            loaded_weight = loaded_weight.reshape(1)
 
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 7a85db0735b9d..347a052a663da 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -134,8 +134,8 @@ def _get_schema(self, weight_quant: BaseModel,
             return CompressedTensorsW8A8StaticTensor()
 
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-           return CompressedTensorsW8A8DynamicToken(
-               strategy=weight_quant.strategy)
+            return CompressedTensorsW8A8DynamicToken(
+                strategy=weight_quant.strategy)
 
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index 90241d66954e2..d2b998d0569a7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -37,8 +37,6 @@ def scales_shard_splitter(
         # update loaded weight with copies for broadcast.
         loaded_weight = loaded_weight.repeat(size)
         # parameter defined for scale is 2D; expand
-        if len(loaded_weight.shape) == 1:
-            loaded_weight = torch.unsqueeze(loaded_weight, -1)
         return param[offset:offset + size], loaded_weight
 
     def create_weights(self, layer: torch.nn.Module,
@@ -59,9 +57,11 @@ def create_weights(self, layer: torch.nn.Module,
             is_tensor_partitioned
             or self.strategy == QuantizationStrategy.CHANNEL) else 1
 
-        weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                             1,
-                                             dtype=torch.float32),
+        shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, )
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            shape = (weight_scale_dim, 1)
+
+        weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32),
                                  requires_grad=False)
 
         weight = Parameter(torch.empty(sum(output_partition_sizes),
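The bug fix in patch 6 undoes the reshape(1, 1) workaround from patch 3 and instead gives weight_scale a strategy-dependent shape, so tensor-strategy scales are 1-D again (matching 0-dim checkpoint scalars reshaped to (1,)) while channel scales stay 2-D. The shape rule in isolation, under the same Union/Tuple annotation the diff uses:

    import torch
    from typing import Tuple, Union

    def scale_shape(dim: int, channel: bool) -> Union[Tuple[int], Tuple[int, int]]:
        # 2-D so channel scales broadcast over an (N, K) weight; 1-D otherwise
        return (dim, 1) if channel else (dim, )

    print(torch.empty(*scale_shape(256, True)).shape)   # torch.Size([256, 1])
    print(torch.empty(*scale_shape(3, False)).shape)    # torch.Size([3])
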
From 3319697efb268c266b859145cd8266fccadaf0f9 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Mon, 17 Jun 2024 18:52:07 +0000
Subject: [PATCH 7/7] remove comment

---
 .../schemes/compressed_tensors_w8a8_dynamictoken.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index d2b998d0569a7..37610c9c2898b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -36,7 +36,6 @@ def scales_shard_splitter(
         size = logical_widths[shard_id]
         # update loaded weight with copies for broadcast.
         loaded_weight = loaded_weight.repeat(size)
-        # parameter defined for scale is 2D; expand
         return param[offset:offset + size], loaded_weight
 
     def create_weights(self, layer: torch.nn.Module,
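With the series applied, the scheme supports both weight strategies end to end, and the parametrized test checks that each variant loads. As a capstone, a self-contained numeric sketch -- plain PyTorch approximating what the custom-op path computes -- of a W8A8 matmul with per-channel weight scales and per-token activation scales:

    import torch

    M, K, N = 4, 16, 8
    w, x = torch.randn(N, K), torch.randn(M, K)

    w_scale = w.abs().amax(dim=1, keepdim=True) / 127.0              # (N, 1)
    w_q = torch.clamp((w / w_scale).round(), -128, 127).to(torch.int8)
    x_scale = x.abs().amax(dim=1, keepdim=True) / 127.0              # (M, 1)
    x_q = torch.clamp((x / x_scale).round(), -128, 127).to(torch.int8)

    # integer accumulate, then fold both scale factors back in
    acc = x_q.to(torch.int32) @ w_q.to(torch.int32).t()              # (M, N)
    y = acc.to(torch.float32) * x_scale * w_scale.t()
    print((y - x @ w.t()).abs().max())  # small quantization error

The (N, 1) weight scale and (M, 1) activation scale broadcast cleanly against the (M, N) accumulator, which is exactly why the series standardizes channel scales as 2-D parameters.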