From 666de99fa6f33ae777d98822d4be7121eab2388a Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Thu, 13 Jun 2024 21:41:37 +0000
Subject: [PATCH 1/7] add channel support for per token

---
 .../quantization/compressed_tensors/compressed_tensors.py | 8 +++-----
 .../schemes/compressed_tensors_w8a8_dynamictoken.py       | 5 ++++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index c7f04784591b2..96dc752ac6350 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -88,10 +88,8 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
     def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-        is_token_tensor = (weight_quant.strategy
-                           == QuantizationStrategy.TENSOR.value) and (
-                               input_quant.strategy
-                               == QuantizationStrategy.TOKEN.value)
+        weight_strategy = weight_quant.strategy == QuantizationStrategy.TENSOR.value or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        is_token_tensor = weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
@@ -118,7 +116,7 @@ def _get_schema(self, weight_quant: BaseModel,
             return CompressedTensorsW8A8StaticTensor()
 
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-            return CompressedTensorsW8A8DynamicToken()
+            return CompressedTensorsW8A8DynamicToken(strategy=weight_quant.strategy)
 
         raise NotImplementedError("Scheme not supported.")
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index 9bb7bf4470872..eed57c682ec1d 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -12,6 +12,8 @@
 
 class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
 
+    def __init__(self, strategy: str):
+        self.strategy = strategy
 
     def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
         if isinstance(shard_id, int):
@@ -45,8 +47,9 @@ def create_weights(self, layer: torch.nn.Module,
         # CompressedTensorsW8A8StaticTensor::create_weights for further
         # information.
         is_tensor_partitioned = len(output_partition_sizes) != 1
+        # TODO: if strategy: channel this should always be weight_scale_dim
         weight_scale_dim = sum(
-            output_partition_sizes) if is_tensor_partitioned else 1
+            output_partition_sizes) if (is_tensor_partitioned or self.strategy == "CHANNEL") else 1
 
         weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
                                       requires_grad=False)
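Patch 1 widens scheme detection so the weight strategy may be per-tensor or per-channel while activations stay per-token, and threads that strategy into the scheme object. For readers new to the distinction, here is a minimal sketch of the two weight granularities -- plain PyTorch, not vLLM's API, with illustrative names:

    import torch

    def quantize_weight_int8(w: torch.Tensor, per_channel: bool):
        """Symmetric int8 weight quantization for w of shape (N, K)."""
        if per_channel:
            # one scale per output channel (row): shape (N, 1)
            scale = w.abs().amax(dim=1, keepdim=True) / 127.0
        else:
            # one scale for the whole tensor: shape (1,)
            scale = w.abs().amax().reshape(1) / 127.0
        w_q = torch.clamp((w / scale).round(), -128, 127).to(torch.int8)
        return w_q, scale

    w = torch.randn(8, 16)
    print(quantize_weight_int8(w, per_channel=False)[1].shape)  # torch.Size([1])
    print(quantize_weight_int8(w, per_channel=True)[1].shape)   # torch.Size([8, 1])

Per-channel scales cost N floats instead of one, but they decouple the quantization range of each output channel, which is why the scheme wants the option.
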
From f3877468c9e56fcfa01fdbe93c3a0e79b3f17922 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 14 Jun 2024 02:37:57 +0000
Subject: [PATCH 2/7] update

---
 vllm/model_executor/layers/linear.py                |  2 ++
 .../compressed_tensors_w8a8_dynamictoken.py         | 33 ++++++++++++++-----
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index f5b6bdd9f7fd7..7cffd68e90528 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -475,6 +475,7 @@ def weight_loader(self,
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
+        print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -786,6 +787,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
+        print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index eed57c682ec1d..4f43b00028e02 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -49,14 +49,20 @@ def create_weights(self, layer: torch.nn.Module,
         is_tensor_partitioned = len(output_partition_sizes) != 1
         # TODO: if strategy: channel this should always be weight_scale_dim
         weight_scale_dim = sum(
-            output_partition_sizes) if (is_tensor_partitioned or self.strategy == "CHANNEL") else 1
+            output_partition_sizes) if (is_tensor_partitioned or self.strategy.value == "channel") else 1
 
         weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
                                       requires_grad=False)
 
-        weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                             dtype=torch.float32),
-                                 requires_grad=False)
+        # Can we add the extra dim for the per tensor case so the shapes are the same?
+        if self.strategy.value == "channel":
+            weight_scale = Parameter(torch.empty(weight_scale_dim, 1,
+                                                 dtype=torch.float32),
+                                     requires_grad=False)
+        else:
+            weight_scale = Parameter(torch.empty(weight_scale_dim,
+                                                 dtype=torch.float32),
+                                     requires_grad=False)
 
         weight = Parameter(torch.empty(sum(output_partition_sizes),
                                        input_size_per_partition,
@@ -70,11 +76,20 @@ def create_weights(self, layer: torch.nn.Module,
 
         layer.register_parameter("weight_scale", weight_scale)
         set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
-        set_weight_attrs(
-            weight_scale, {
-                "shard_splitter": self.scales_shard_splitter,
-                "logical_widths": output_partition_sizes
-            })
+
+        if self.strategy.value == "channel":
+            set_weight_attrs(
+                weight_scale, {
+                    "output_dim": 0,
+                })
+
+        # Shouldn't need the shard_splitter if using channel-wise. Confirm this all loads
+        if self.strategy.value != "channel":
+            set_weight_attrs(
+                weight_scale, {
+                    "logical_widths": output_partition_sizes,
+                    "shard_splitter": self.scales_shard_splitter,
+                })
 
         layer.register_parameter("weight_zero_point", weight_zero_point)
         set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader})
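The substantive move in patch 2: channel-wise weight scales become a 2-D (num_channels, 1) parameter tagged with an "output_dim" attribute, so the stock row-sharded weight loader can slice them exactly like the int8 weight they belong to, while per-tensor scales keep the custom shard splitter. A rough sketch of why output_dim alone suffices for the channel case -- a hypothetical loader, not the real one in linear.py:

    import torch

    def load_output_sharded(param: torch.Tensor, full: torch.Tensor,
                            output_dim: int, tp_rank: int, tp_size: int):
        """Copy this rank's slice of a checkpoint tensor along output_dim."""
        shard = full.shape[output_dim] // tp_size
        param.copy_(full.narrow(output_dim, tp_rank * shard, shard))

    # A (16, 1) channel scale shards across 2 ranks just like its (16, K)
    # weight would -- no bespoke splitter needed.
    full_scale = torch.rand(16, 1)
    param = torch.empty(8, 1)
    load_output_sharded(param, full_scale, output_dim=0, tp_rank=1, tp_size=2)
    assert torch.equal(param, full_scale[8:])
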
From fd43792873923beba69ed08055dc7201b10b8270 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 14 Jun 2024 13:50:10 +0000
Subject: [PATCH 3/7] format; cleanup

---
 vllm/model_executor/layers/linear.py                | 16 +-------
 .../compressed_tensors/compressed_tensors.py        | 10 +++--
 .../compressed_tensors_w8a8_dynamictoken.py         | 39 +++++++++----------
 3 files changed, 27 insertions(+), 38 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 7cffd68e90528..8518485572539 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -468,14 +468,6 @@ def weight_loader(self,
                     "MergedColumnParallelLinear, assume the weight is "
                     "the same for all partitions.")
 
-        if fp8_scales_shard_indexer is None:
-            if len(param_data.shape) == 0:
-                param_data = param_data.reshape(1)
-
-            if len(loaded_weight.shape) == 0:
-                loaded_weight = loaded_weight.reshape(1)
-
-        print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -687,12 +679,6 @@ def weight_loader(self,
                     "QKVParallelLinear, assume the weight is the same "
                     "for all partitions.")
 
-        if len(param_data.shape) == 0:
-            param_data = param_data.reshape(1)
-
-        if len(loaded_weight.shape) == 0:
-            loaded_weight = loaded_weight.reshape(1)
-
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -785,7 +771,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
                                                           shard_id=0)
 
         if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
-            loaded_weight = loaded_weight.reshape(1)
+            loaded_weight = loaded_weight.reshape(1, 1)
 
         print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 96dc752ac6350..1214386afc831 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -88,8 +88,11 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
     def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
-        weight_strategy = weight_quant.strategy == QuantizationStrategy.TENSOR.value or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
-        is_token_tensor = weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
+        is_token_tensor = (weight_strategy and input_quant.strategy
+                           == QuantizationStrategy.TOKEN.value)
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
@@ -116,7 +119,8 @@ def _get_schema(self, weight_quant: BaseModel,
             return CompressedTensorsW8A8StaticTensor()
 
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-            return CompressedTensorsW8A8DynamicToken(strategy=weight_quant.strategy)
+            return CompressedTensorsW8A8DynamicToken(
+                strategy=weight_quant.strategy)
 
         raise NotImplementedError("Scheme not supported.")
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index 4f43b00028e02..a8d7b89079b21 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -6,12 +6,15 @@
 from vllm import _custom_ops as custom_ops
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.utils import set_weight_attrs
 
 __all__ = ["CompressedTensorsW8A8DynamicToken"]
 
 
 class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
+
     def __init__(self, strategy: str):
         self.strategy = strategy
 
@@ -33,6 +36,9 @@ def scales_shard_splitter(
         size = logical_widths[shard_id]
         # update loaded weight with copies for broadcast.
         loaded_weight = loaded_weight.repeat(size)
+        # parameter defined for scale is 2D; expand
+        if len(loaded_weight.shape) == 1:
+            loaded_weight = torch.unsqueeze(loaded_weight, -1)
         return param[offset:offset + size], loaded_weight
 
     def create_weights(self, layer: torch.nn.Module,
@@ -47,22 +53,18 @@ def create_weights(self, layer: torch.nn.Module,
         # CompressedTensorsW8A8StaticTensor::create_weights for further
         # information.
         is_tensor_partitioned = len(output_partition_sizes) != 1
-        # TODO: if strategy: channel this should always be weight_scale_dim
-        weight_scale_dim = sum(
-            output_partition_sizes) if (is_tensor_partitioned or self.strategy.value == "channel") else 1
+
+        weight_scale_dim = sum(output_partition_sizes) if (
+            is_tensor_partitioned
+            or self.strategy == QuantizationStrategy.CHANNEL) else 1
 
         weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
                                       requires_grad=False)
 
-        # Can we add the extra dim for the per tensor case so the shapes are the same?
-        if self.strategy.value == "channel":
-            weight_scale = Parameter(torch.empty(weight_scale_dim, 1,
-                                                 dtype=torch.float32),
-                                     requires_grad=False)
-        else:
-            weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                                 dtype=torch.float32),
-                                     requires_grad=False)
+        weight_scale = Parameter(torch.empty(weight_scale_dim,
+                                             1,
+                                             dtype=torch.float32),
+                                 requires_grad=False)
 
         weight = Parameter(torch.empty(sum(output_partition_sizes),
                                        input_size_per_partition,
@@ -77,14 +79,11 @@ def create_weights(self, layer: torch.nn.Module,
 
         layer.register_parameter("weight_scale", weight_scale)
         set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
-
-        if self.strategy.value == "channel":
-            set_weight_attrs(
-                weight_scale, {
-                    "output_dim": 0,
-                })
-
-        # Shouldn't need the shard_splitter if using channel-wise. Confirm this all loads
-        if self.strategy.value != "channel":
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            set_weight_attrs(weight_scale, {
+                "output_dim": 0,
+            })
+        else:
             set_weight_attrs(
                 weight_scale, {
                     "logical_widths": output_partition_sizes,
                     "shard_splitter": self.scales_shard_splitter,
                 })
 
         layer.register_parameter("weight_zero_point", weight_zero_point)
         set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader})
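Besides the formatting pass, patch 3 makes the per-tensor path aware of the now 2-D scale parameter: scales_shard_splitter repeats a scalar scale across a merged shard's rows, then unsqueezes the result to (size, 1). A toy version of that broadcast, with invented logical widths standing in for fused q/k/v partitions:

    import torch

    logical_widths = [4, 4, 8]                 # e.g. q, k, v output partitions
    param = torch.zeros(sum(logical_widths), 1)

    def splitter(param, loaded, shard_id):
        offset = sum(logical_widths[:shard_id])
        size = logical_widths[shard_id]
        loaded = loaded.repeat(size)           # scalar scale -> one copy per row
        if loaded.dim() == 1:
            loaded = loaded.unsqueeze(-1)      # (size,) -> (size, 1), matches param
        return param[offset:offset + size], loaded

    dst, src = splitter(param, torch.tensor([0.02]), shard_id=2)
    dst.copy_(src)
    print(param[8:].flatten())                 # eight copies of 0.02
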
From d804d985d1895f97746612031e08a0c64d64a280 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 14 Jun 2024 13:52:25 +0000
Subject: [PATCH 4/7] comments

---
 .../schemes/compressed_tensors_w8a8_dynamictoken.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index a8d7b89079b21..1743c5a599378 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -53,7 +53,8 @@ def create_weights(self, layer: torch.nn.Module,
         # CompressedTensorsW8A8StaticTensor::create_weights for further
         # information.
         is_tensor_partitioned = len(output_partition_sizes) != 1
-
+        # when doing channel-wise quantization, number of scales
+        # is equal to output_dim
         weight_scale_dim = sum(output_partition_sizes) if (
             is_tensor_partitioned
             or self.strategy == QuantizationStrategy.CHANNEL) else 1
@@ -79,6 +80,8 @@ def create_weights(self, layer: torch.nn.Module,
 
         layer.register_parameter("weight_scale", weight_scale)
         set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
+        # Don't need a shard_splitter for channel-wise quantization
+        # Use the default loading method
         if self.strategy == QuantizationStrategy.CHANNEL:
             set_weight_attrs(weight_scale, {
                 "output_dim": 0,
             })
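Patch 4 only adds comments, but they pin down the design: channel scales load through the default output_dim machinery, tensor scales through the splitter. The "dynamic token" half of the scheme's name concerns activations, whose scales are never loaded at all -- they are computed per token at runtime. A minimal sketch of that idea in plain PyTorch (the real path runs inside a vLLM custom op):

    import torch

    def dynamic_per_token_quant(x: torch.Tensor):
        """x: activations of shape (num_tokens, hidden); one scale per row."""
        scale = x.abs().amax(dim=1, keepdim=True) / 127.0    # (num_tokens, 1)
        x_q = torch.clamp((x / scale).round(), -128, 127).to(torch.int8)
        return x_q, scale

    x_q, x_scale = dynamic_per_token_quant(torch.randn(4, 16))
    print(x_q.dtype, x_scale.shape)  # torch.int8 torch.Size([4, 1])
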
From 80de429ab903bf098fad08e6dee8a0ad716ab010 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 14 Jun 2024 14:01:29 +0000
Subject: [PATCH 5/7] update

---
 vllm/model_executor/layers/linear.py                       | 1 -
 .../quantization/compressed_tensors/compressed_tensors.py | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 8518485572539..dbbd52dccd61d 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -773,7 +773,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1, 1)
 
-        print(param_data.shape, loaded_weight.shape)
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 1214386afc831..3c95522fe00fa 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -91,12 +91,12 @@ def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.TENSOR.value
             or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
-        is_token_tensor = (weight_strategy and input_quant.strategy
-                           == QuantizationStrategy.TOKEN.value)
+        is_token = (weight_strategy and input_quant.strategy
+                    == QuantizationStrategy.TOKEN.value)
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
-        return is_8_bits and is_token_tensor and is_symmetric and is_dynamic
+        return is_8_bits and is_token and is_symmetric and is_dynamic
 
     def _is_w4a16(self, weight_quant: BaseModel,
                   input_quant: BaseModel) -> bool:
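Patch 5 drops the leftover debug prints and renames is_token_tensor to is_token, which better matches what the predicate now accepts. Restated as a self-contained sketch -- the QuantArgs dataclass is a stand-in for the pydantic config models the real method receives:

    from dataclasses import dataclass

    @dataclass
    class QuantArgs:
        num_bits: int
        strategy: str        # "tensor" | "channel" | "token"
        symmetric: bool
        dynamic: bool

    def is_dynamic_token_w8a8(weight: QuantArgs, act: QuantArgs) -> bool:
        """Static symmetric int8 weights (tensor- or channel-wise) plus
        dynamic symmetric per-token int8 activations."""
        return (weight.num_bits == act.num_bits == 8
                and weight.strategy in ("tensor", "channel")
                and act.strategy == "token"
                and weight.symmetric and act.symmetric
                and not weight.dynamic and act.dynamic)

    print(is_dynamic_token_w8a8(QuantArgs(8, "channel", True, False),
                                QuantArgs(8, "token", True, True)))  # True
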
From 5d59f7ab9d2d7ffc48b7aaa7db50771369593102 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Mon, 17 Jun 2024 18:50:02 +0000
Subject: [PATCH 6/7] fix bug; update test

---
 tests/quantization/test_compressed_tensors.py       | 13 +++++++++----
 vllm/model_executor/layers/linear.py                |  2 +-
 .../compressed_tensors/compressed_tensors.py        |  4 ++--
 .../schemes/compressed_tensors_w8a8_dynamictoken.py | 10 +++++-----
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 611c6b8b7fb9a..b78081155e2ba 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -14,7 +14,7 @@
 
 
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
+    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path, enforce_eager=True) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
@@ -43,15 +43,19 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):
 
 
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
+    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
         sampling_params = SamplingParams()
         output = llm.generate("Hello world!", sampling_params=sampling_params)
         assert output
 
 
-def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
+@pytest.mark.parametrize("model_args", [
+    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
+    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
+])
+def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
+    model_path, strategy = model_args
     with vllm_runner(model_path, dtype=torch.float16) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
@@ -60,6 +64,7 @@
         qkv_proj = layer.self_attn.qkv_proj
 
         assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
         assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
+        assert qkv_proj.scheme.strategy == strategy
         assert qkv_proj.weight.dtype is torch.int8
 
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index d29ab35efa50f..45f805547b414 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -771,7 +771,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
                                                           shard_id=0)
 
         if fp8_scales_shard_indexer is None and len(loaded_weight.shape) == 0:
-            loaded_weight = loaded_weight.reshape(1, 1)
+            loaded_weight = loaded_weight.reshape(1)
 
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 7a85db0735b9d..347a052a663da 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -134,8 +134,8 @@ def _get_schema(self, weight_quant: BaseModel,
             return CompressedTensorsW8A8StaticTensor()
 
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
-           return CompressedTensorsW8A8DynamicToken(
-               strategy=weight_quant.strategy)
+            return CompressedTensorsW8A8DynamicToken(
+                strategy=weight_quant.strategy)
 
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index 90241d66954e2..d2b998d0569a7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -37,8 +37,6 @@ def scales_shard_splitter(
         # update loaded weight with copies for broadcast.
         loaded_weight = loaded_weight.repeat(size)
         # parameter defined for scale is 2D; expand
-        if len(loaded_weight.shape) == 1:
-            loaded_weight = torch.unsqueeze(loaded_weight, -1)
         return param[offset:offset + size], loaded_weight
 
     def create_weights(self, layer: torch.nn.Module,
@@ -59,9 +57,11 @@ def create_weights(self, layer: torch.nn.Module,
             is_tensor_partitioned
             or self.strategy == QuantizationStrategy.CHANNEL) else 1
 
-        weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                             1,
-                                             dtype=torch.float32),
+        shape: Union[Tuple[int], Tuple[int, int]] = (weight_scale_dim, )
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            shape = (weight_scale_dim, 1)
+
+        weight_scale = Parameter(torch.empty(*shape, dtype=torch.float32),
                                  requires_grad=False)
 
         weight = Parameter(torch.empty(sum(output_partition_sizes),
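The bug fix in patch 6 undoes the reshape(1, 1) workaround from patch 3 and instead gives weight_scale a strategy-dependent shape, so tensor-strategy scales are 1-D again (matching 0-dim checkpoint scalars reshaped to (1,)) while channel scales stay 2-D. The shape rule in isolation, under the same Union/Tuple annotation the diff uses:

    import torch
    from typing import Tuple, Union

    def scale_shape(dim: int, channel: bool) -> Union[Tuple[int], Tuple[int, int]]:
        # 2-D so channel scales broadcast over an (N, K) weight; 1-D otherwise
        return (dim, 1) if channel else (dim, )

    print(torch.empty(*scale_shape(256, True)).shape)   # torch.Size([256, 1])
    print(torch.empty(*scale_shape(3, False)).shape)    # torch.Size([3])
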
From 3319697efb268c266b859145cd8266fccadaf0f9 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Mon, 17 Jun 2024 18:52:07 +0000
Subject: [PATCH 7/7] remove comment

---
 .../schemes/compressed_tensors_w8a8_dynamictoken.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
index d2b998d0569a7..37610c9c2898b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -36,7 +36,6 @@ def scales_shard_splitter(
         size = logical_widths[shard_id]
         # update loaded weight with copies for broadcast.
         loaded_weight = loaded_weight.repeat(size)
-        # parameter defined for scale is 2D; expand
         return param[offset:offset + size], loaded_weight
 
     def create_weights(self, layer: torch.nn.Module,
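With the series applied, the scheme supports both weight strategies end to end, and the parametrized test checks that each variant loads. As a capstone, a self-contained numeric sketch -- plain PyTorch approximating what the custom-op path computes -- of a W8A8 matmul with per-channel weight scales and per-token activation scales:

    import torch

    M, K, N = 4, 16, 8
    w, x = torch.randn(N, K), torch.randn(M, K)

    w_scale = w.abs().amax(dim=1, keepdim=True) / 127.0              # (N, 1)
    w_q = torch.clamp((w / w_scale).round(), -128, 127).to(torch.int8)
    x_scale = x.abs().amax(dim=1, keepdim=True) / 127.0              # (M, 1)
    x_q = torch.clamp((x / x_scale).round(), -128, 127).to(torch.int8)

    # integer accumulate, then fold both scale factors back in
    acc = x_q.to(torch.int32) @ w_q.to(torch.int32).t()              # (M, N)
    y = acc.to(torch.float32) * x_scale * w_scale.t()
    print((y - x @ w.t()).abs().max())  # small quantization error

The (N, 1) weight scale and (M, 1) activation scale broadcast cleanly against the (M, N) accumulator, which is exactly why the series standardizes channel scales as 2-D parameters.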