forked from openvinotoolkit/openvino
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[GPU] KV-cache compression support (openvinotoolkit#27114)
### Details:
This PR enables KV-cache compression support. Currently, it supports only combinations of the following configurations:
* Data types: INT8_SYM / INT8_ASYM
* Modes: per-token (quantization of `num_heads * head_size` in a single group) / per-token-per-head (quantization of each `head_size` group for each head per token)

### Tickets:
- *ticket-id*
- Loading branch information
Showing
69 changed files
with
3,447 additions
and
343 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
56 changes: 56 additions & 0 deletions
56
src/plugins/intel_gpu/include/intel_gpu/op/kv_cache_compressed.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
// Copyright (C) 2023 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#pragma once | ||
|
||
#include "intel_gpu/op/kv_cache.hpp" | ||
#include "ov_ops/dynamic_quantize.hpp" | ||
|
||
namespace ov { | ||
namespace intel_gpu { | ||
namespace op { | ||
|
||
/// \brief Operator that implements Key-Values cache subgraph for large language models. | ||
/// This operation updates data of the corresponding Variable | ||
class KVCacheCompressed : public ov::intel_gpu::op::KVCache { | ||
public: | ||
OPENVINO_OP("KVCacheCompressed", "gpu_opset"); | ||
|
||
using QuantizationAttrs = ov::op::internal::DynamicQuantize::Attributes; | ||
|
||
KVCacheCompressed() = default; | ||
|
||
KVCacheCompressed(const OutputVector& inputs, | ||
const std::shared_ptr<ov::op::util::Variable>& past_values, | ||
int64_t concat_axis, | ||
int64_t gather_axis, | ||
const QuantizationAttrs& quantization_attrs, | ||
const ov::element::Type output_type = ov::element::undefined); | ||
|
||
void validate_and_infer_types() override; | ||
|
||
std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override; | ||
|
||
bool get_kv_compressed() const { return m_compressed; } | ||
bool get_combine_scales_and_zp() const { | ||
return m_quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && | ||
m_quantization_attrs.output_storage_type != ov::op::internal::DynamicQuantize::OutputStorageType::Planar; | ||
} | ||
|
||
QuantizationAttrs get_quantization_attrs() const { return m_quantization_attrs; } | ||
void set_quantization_attrs(QuantizationAttrs attrs) { m_quantization_attrs = std::move(attrs); } | ||
|
||
std::vector<uint64_t> get_scales_zp_output_order() const { return m_quantization_attrs.scales_zp_output_order; } | ||
|
||
private: | ||
bool m_compressed; | ||
QuantizationAttrs m_quantization_attrs = {}; | ||
}; | ||
|
||
/// \brief Shape-inference helper for KVCacheCompressed (declaration only; the definition is not in this header).
/// \param op Operation whose shapes are being inferred.
/// \param input_shapes Partial shapes supplied for the operation's inputs.
/// \return Inferred partial shapes - presumably one entry per output; confirm against the definition.
std::vector<ov::PartialShape> shape_infer(const KVCacheCompressed* op, | ||
const std::vector<ov::PartialShape>& input_shapes); | ||
|
||
} // namespace op | ||
} // namespace intel_gpu | ||
} // namespace ov |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.