Support int8 KVCache Quant in Vllm #1507

Closed
wants to merge 57 commits

Changes from all commits (57 commits)
ce271bc
support kv cache quantization
Sep 19, 2023
f8b0b05
fix python code
Sep 19, 2023
b1560db
merge and reformat
Sep 20, 2023
5c672ec
support generating kv quant parameters and evaluting kv quant models
Sep 27, 2023
f8d6b99
modify test functions
Sep 28, 2023
f8427e3
fix test code
Sep 28, 2023
df286fe
fix test attention
Sep 28, 2023
b2d9b8c
modify attention kernel test using pytest
Oct 12, 2023
c5a1a73
fix quant parameter passing
Oct 16, 2023
fbed95c
code clean
Oct 30, 2023
f396ed3
code clean
Oct 30, 2023
ad8f950
Merge branch 'main' into kv_quant
AniZpZ Nov 2, 2023
2543722
code format
Nov 3, 2023
4226683
code format
Nov 3, 2023
df15d44
fix merge
Nov 15, 2023
872d156
fix reshape_and_cache_quantized
Nov 20, 2023
8c29013
tmp fix
Nov 22, 2023
8b5278d
tmp fix2
Nov 22, 2023
d8a9d4a
update kv-quant kernels
Nov 23, 2023
0b06f96
add kv-quant kernel tests
Nov 23, 2023
734dcc6
support kv-quant
Nov 23, 2023
31c4083
code format
Nov 24, 2023
16bccc4
fix work bugs
Nov 24, 2023
dd527fc
fix unit test
Nov 27, 2023
104fb9b
fix unit test
Nov 29, 2023
580566c
fix kv-quant args
Dec 5, 2023
88ba3c0
fix attention params
Dec 18, 2023
e2ff5a6
Merge tag 'v0.2.7' into kv_quant_v0.2.7
Jan 16, 2024
3065a32
format code
Jan 16, 2024
a896eb3
add .buildkite
Jan 16, 2024
4072871
merge with remote branch 'vllm/main'
Feb 4, 2024
c0d3895
Merge branch 'kv_quant_merge' into kv_quant
Feb 5, 2024
f670d3c
Merge pull request #13 in wm_ai/project_v from tmp to kv_quant - <mer…
Feb 5, 2024
666549d
Merge branch 'main' into kv_quant
AniZpZ Feb 5, 2024
16bb483
fix compile issue
Feb 5, 2024
ca1fcb3
fix unit test issue
Feb 5, 2024
33f9d53
fix issues
Feb 7, 2024
594ec3f
support exporting kv quant params for transformers>=4.36.0
Feb 7, 2024
c37770b
fix benchmarks for kv cache int8
Feb 7, 2024
815eda7
Merge branch 'main' into kv_quant
HandH1998 Feb 7, 2024
14ec0ca
fix supporting kv cache int8 for specified models
Feb 7, 2024
2ff0e20
add int8_kv_cache.rst
Feb 7, 2024
5744c38
code format
Feb 8, 2024
cf7d939
code format
Feb 8, 2024
d79a96e
code format
Feb 19, 2024
9a2c2c6
code format
Feb 19, 2024
b1d4ce3
modify int8 kv cache doc
Feb 19, 2024
74013b7
fix conflicts
Mar 25, 2024
128cbae
fix conflicts
Mar 25, 2024
e24d431
fix conflicts
Mar 26, 2024
2f38a1c
fix rocm compile
Mar 26, 2024
74d706e
code format
Mar 26, 2024
a999930
fix rocm compile
Mar 26, 2024
98ef941
fix param passing
Mar 26, 2024
95f8cc7
fix param passing
Mar 26, 2024
02c949a
add int8_kv_cache.rst to toctree
Mar 26, 2024
f9fed66
relax int8 kv quant tolerance
Mar 26, 2024
9 changes: 8 additions & 1 deletion benchmarks/benchmark_latency.py
@@ -25,6 +25,7 @@ def main(args: argparse.Namespace):
 dtype=args.dtype,
 enforce_eager=args.enforce_eager,
 kv_cache_dtype=args.kv_cache_dtype,
+kv_quant_params_path=args.kv_quant_params_path,
 device=args.device,
 ray_workers_use_nsight=args.ray_workers_use_nsight,
 )
@@ -126,10 +127,16 @@ def run_to_completion(profile_dir: Optional[str] = None):
 parser.add_argument(
 "--kv-cache-dtype",
 type=str,
-choices=['auto', 'fp8_e5m2'],
+choices=['auto', 'fp8_e5m2', 'int8'],
 default='auto',
 help=
 'Data type for kv cache storage. If "auto", will use model data type.')
+parser.add_argument(
+"--kv-quant-params-path",
+type=str,
+default=None,
+help='Path to scales and zero points of kv cache quantization '
+'when kv cache dtype is int8.')
 parser.add_argument(
 '--profile',
 action='store_true',
9 changes: 8 additions & 1 deletion benchmarks/benchmark_throughput.py
@@ -88,6 +88,7 @@ def run_vllm(
 gpu_memory_utilization=gpu_memory_utilization,
 enforce_eager=enforce_eager,
 kv_cache_dtype=kv_cache_dtype,
+kv_quant_params_path=args.kv_quant_params_path,
 device=device,
 enable_prefix_caching=enable_prefix_caching)

@@ -300,10 +301,16 @@ def main(args: argparse.Namespace):
 parser.add_argument(
 "--kv-cache-dtype",
 type=str,
-choices=["auto", "fp8_e5m2"],
+choices=["auto", "fp8_e5m2", "int8"],
 default="auto",
 help=
 'Data type for kv cache storage. If "auto", will use model data type.')
+parser.add_argument(
+"--kv-quant-params-path",
+type=str,
+default=None,
+help='Path to scales and zero points of kv cache quantization '
+'when kv cache dtype is int8.')
 parser.add_argument(
 "--device",
 type=str,
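Both benchmark scripts above gain the same pair of options: --kv-cache-dtype int8 selects the quantized cache, and --kv-quant-params-path points at the exported scales and zero points. For orientation, here is a minimal sketch of how these options reach the engine on this branch; the kv_quant_params_path keyword exists only in this PR, and the model name and path are placeholders:

from vllm import LLM, SamplingParams

# Placeholder model and path; kv_quant_params_path is added by this PR and is
# not part of mainline vLLM. kv_cache_dtype="int8" enables the quantized cache.
llm = LLM(
    model="meta-llama/Llama-2-13b-hf",
    kv_cache_dtype="int8",
    kv_quant_params_path="/path/to/kv_quant_params",
)

outputs = llm.generate(
    ["Hello, my name is"],
    SamplingParams(temperature=0.0, max_tokens=16),
)
print(outputs[0].outputs[0].text)

The benchmark scripts do essentially this, with both values taken from argparse.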
22 changes: 21 additions & 1 deletion benchmarks/kernels/benchmark_paged_attention.py
@@ -74,6 +74,18 @@ def main(
 device=device)
 key_cache, value_cache = key_caches[0], value_caches[0]

+# Prepare kv quant parameters for kv_cache_dtype=int8.
+# NOTE(zhangying): These parameters only work when kv_cache_dtype is int8.
+# They have no influence on other kv_cache_dtypes, like auto and fp8_e5m2.
+# For Llama-13B, we find that the key scale distribution range is [0.05, 0.15],
+# the value scale distribution range is [0.005, 0.10],
+# the key zero point distribution range is [-1.5, 1.5],
+# the value zero point distribution range is [-2.0, 2.0].
+k_scale = random.random() * 0.10 + 0.05
+v_scale = random.random() * 0.095 + 0.005
+k_zp = random.random() * 3.0 - 1.5
+v_zp = random.random() * 4.0 - 2.0
+
 # Prepare for the paged attention kernel.
 output = torch.empty_like(query)
 if version == "v2":
@@ -112,6 +124,10 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
 max_context_len,
 alibi_slopes,
 kv_cache_dtype,
+k_scale,
+k_zp,
+v_scale,
+v_zp,
 )
 elif version == "v2":
 ops.paged_attention_v2(
@@ -130,6 +146,10 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
 max_context_len,
 alibi_slopes,
 kv_cache_dtype,
+k_scale,
+k_zp,
+v_scale,
+v_zp,
 )
 else:
 raise ValueError(f"Invalid version: {version}")
@@ -179,7 +199,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
 parser.add_argument(
 "--kv-cache-dtype",
 type=str,
-choices=["auto", "fp8_e5m2"],
+choices=["auto", "fp8_e5m2", "int8"],
 default="auto",
 help=
 'Data type for kv cache storage. If "auto", will use model data type.')
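The NOTE in the hunk above only documents empirically observed ranges for the scales and zero points; the quantization itself happens inside the attention kernels, which now take k_scale, k_zp, v_scale, v_zp after kv_cache_dtype. As a rough illustration of what those four scalars mean, here is a minimal PyTorch sketch of asymmetric int8 quantization, assuming the zero point is expressed in the original floating-point domain (x is reconstructed as q * scale + zp); the convention used by the CUDA kernels in this PR may differ:

import torch

def quantize_kv_int8(x: torch.Tensor, scale: float, zp: float) -> torch.Tensor:
    # Map float values to int8 with a per-tensor scale and zero point.
    # Assumption: zp lives in the float domain, so x is recovered as q * scale + zp.
    return torch.round((x - zp) / scale).clamp(-128, 127).to(torch.int8)

def dequantize_kv_int8(q: torch.Tensor, scale: float, zp: float) -> torch.Tensor:
    return q.to(torch.float32) * scale + zp

# Toy check with a key scale/zero point drawn from the ranges quoted in the NOTE.
k = torch.randn(16) * 0.2
k_scale, k_zp = 0.10, 0.5
k_hat = dequantize_kv_int8(quantize_kv_int8(k, k_scale, k_zp), k_scale, k_zp)
print((k - k_hat).abs().max())  # rounding error, at most ~scale / 2 when nothing clips

The benchmark only needs plausible scalar values, which is why it samples them from the quoted ranges with random.random() instead of reading them from a --kv-quant-params-path file.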
1 change: 1 addition & 0 deletions csrc/attention/attention_dtypes.h
@@ -4,4 +4,5 @@
 #include "dtype_float16.cuh"
 #include "dtype_float32.cuh"
 #include "dtype_bfloat16.cuh"
+#include "dtype_int8.cuh"
 #include "dtype_fp8_e5m2.cuh"