Merge branch 'main' into update_gptq
dsikka authored Aug 30, 2024
2 parents e53d3ba + 2684efc commit 7bb6dd3
Showing 151 changed files with 3,339 additions and 1,117 deletions.
132 changes: 94 additions & 38 deletions benchmarks/benchmark_serving.py
@@ -61,15 +61,22 @@ class BenchmarkMetrics:
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
p99_ttft_ms: float
percentiles_ttft_ms: List[Tuple[float, float]]
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
p99_tpot_ms: float
percentiles_tpot_ms: List[Tuple[float, float]]
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
p99_itl_ms: float
percentiles_itl_ms: List[Tuple[float, float]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms: float
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: List[Tuple[float, float]]


def sample_sharegpt_requests(
@@ -235,13 +242,16 @@ def calculate_metrics(
outputs: List[RequestFuncOutput],
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
total_input = 0
completed = 0
itls: List[float] = []
tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
@@ -258,6 +268,7 @@ def calculate_metrics(
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
e2els.append(outputs[i].latency)
completed += 1
else:
actual_output_lens.append(0)
@@ -276,17 +287,25 @@ def calculate_metrics(
output_throughput=sum(actual_output_lens) / dur_s,
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
std_ttft_ms=np.std(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
for p in selected_percentiles],
mean_tpot_ms=np.mean(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
for p in selected_percentiles],
mean_itl_ms=np.mean(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
)

return metrics, actual_output_lens
@@ -304,6 +323,8 @@ async def benchmark(
request_rate: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -392,6 +413,8 @@ async def benchmark(
outputs=outputs,
dur_s=benchmark_duration,
tokenizer=tokenizer,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
)

print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -407,23 +430,6 @@ async def benchmark(
metrics.input_throughput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
metrics.median_ttft_ms))
print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
n=50,
c='-'))
print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
metrics.median_tpot_ms))
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
print("=" * 50)

result = {
"duration": benchmark_duration,
@@ -433,25 +439,54 @@ async def benchmark(
"request_throughput": metrics.request_throughput,
"input_throughput": metrics.input_throughput,
"output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms,
"std_ttft_ms": metrics.std_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
"std_tpot_ms": metrics.std_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms,
"mean_itl_ms": metrics.mean_itl_ms,
"median_itl_ms": metrics.median_itl_ms,
"std_itl_ms": metrics.std_itl_ms,
"p99_itl_ms": metrics.p99_itl_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
"ttfts": [output.ttft for output in outputs],
"itls": [output.itl for output in outputs],
"generated_texts": [output.generated_text for output in outputs],
"errors": [output.error for output in outputs],
}

def process_one_metric(
# E.g., "ttft"
metric_attribute_name: str,
# E.g., "TTFT"
metric_name: str,
# E.g., "Time to First Token"
metric_header: str,
):
# Print and record statistics for the specified metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms")))
result[f"mean_{metric_attribute_name}_ms"] = getattr(
metrics, f"mean_{metric_attribute_name}_ms")
result[f"median_{metric_attribute_name}_ms"] = getattr(
metrics, f"median_{metric_attribute_name}_ms")
result[f"std_{metric_attribute_name}_ms"] = getattr(
metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value

process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")

print("=" * 50)

return result


@@ -550,6 +585,10 @@ def main(args: argparse.Namespace):
request_rate=args.request_rate,
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
))

# Save config and results to json
Expand Down Expand Up @@ -765,6 +804,23 @@ def main(args: argparse.Namespace):
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
" format.",
)
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
)

args = parser.parse_args()
main(args)
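For reference, a run that reports the 50th, 90th, and 99th percentiles for all four metrics could be invoked as sketched below. Only `--percentile-metrics` and `--metric-percentiles` are introduced by this change; the `--backend` and `--model` flags are assumed from the script's existing argument parser, and the model name is only a placeholder.

```bash
# Sketch of a benchmark run using the new percentile options.
# --backend/--model are assumed pre-existing flags; adjust to your setup.
python benchmarks/benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --percentile-metrics ttft,tpot,itl,e2el \
    --metric-percentiles 50,90,99
```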
3 changes: 2 additions & 1 deletion docs/requirements-docs.txt
@@ -11,5 +11,6 @@ pydantic >= 2.8
torch
py-cpuinfo
transformers
mistral_common >= 1.3.4
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
mistral_common >= 1.3.4
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
4 changes: 4 additions & 0 deletions docs/source/models/supported_models.rst
@@ -51,6 +51,10 @@ Decoder-only Language Models
- DeciLM
- :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc.
-
* - :code:`ExaoneForCausalLM`
- EXAONE-3
- :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.
- ✅︎
* - :code:`FalconForCausalLM`
- Falcon
- :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
26 changes: 26 additions & 0 deletions docs/source/serving/openai_compatible_server.md
@@ -111,6 +111,32 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
:prog: vllm serve
```

### Config file

The `serve` module can also accept arguments from a config file in `yaml` format. The arguments in the YAML file must use the long form of the argument names outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server):

For example:

```yaml
# config.yaml

host: "127.0.0.1"
port: 6379
uvicorn-log-level: "info"
```
```bash
$ vllm serve SOME_MODEL --config config.yaml
```
---
**NOTE**
If an argument is supplied both on the command line and in the config file, the command-line value takes precedence; see the example below.
The order of priority is `command line > config file values > defaults`.

---
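
For example, assuming the server's standard `--port` flag, a value supplied on the command line overrides the one from `config.yaml` above:

```bash
# --port 8000 takes precedence over port: 6379 from config.yaml;
# host and uvicorn-log-level are still read from the config file.
$ vllm serve SOME_MODEL --config config.yaml --port 8000
```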

## Tool calling in the chat completion API
vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but are on the roadmap.

11 changes: 9 additions & 2 deletions examples/offline_inference_neuron.py
@@ -1,5 +1,12 @@
import os

from vllm import LLM, SamplingParams

# Creates XLA HLO graphs for all the context length buckets.
os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
# Creates XLA HLO graphs for all the token gen buckets.
os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"

# Sample prompts.
prompts = [
"Hello, my name is",
@@ -19,8 +26,8 @@
# Currently, this is a known limitation in continuous batching support
# in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len=128,
block_size=128,
max_model_len=2048,
block_size=2048,
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
1 change: 1 addition & 0 deletions requirements-common.txt
@@ -27,3 +27,4 @@ soundfile # Required for audio processing
gguf == 0.9.1
importlib_metadata
mistral_common >= 1.3.4
pyyaml
2 changes: 1 addition & 1 deletion requirements-tpu.txt
@@ -4,4 +4,4 @@
# Dependencies for TPU
# Currently, the TPU backend uses a nightly version of PyTorch XLA.
# You can install the dependencies in Dockerfile.tpu.
ray
ray[default]
6 changes: 6 additions & 0 deletions tests/conftest.py
@@ -209,8 +209,14 @@ class HfRunner:

def wrap_device(self, input: _T) -> _T:
if not is_cpu():
# Check if the input is already on the GPU
if hasattr(input, 'device') and input.device.type == "cuda":
return input # Already on GPU, no need to move
return input.to("cuda")
else:
# Check if the input is already on the CPU
if hasattr(input, 'device') and input.device.type == "cpu":
return input # Already on CPU, no need to move
return input.to("cpu")

def __init__(
2 changes: 2 additions & 0 deletions tests/data/test_config.yaml
@@ -0,0 +1,2 @@
port: 12312
tensor_parallel_size: 2
3 changes: 1 addition & 2 deletions tests/models/test_intern_vit.py
@@ -6,8 +6,6 @@
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor

from vllm.model_executor.models.intern_vit import InternVisionModel

from ..conftest import _ImageAssets, cleanup

pytestmark = pytest.mark.vlm
@@ -49,6 +47,7 @@ def run_intern_vit_test(
for pixel_value in pixel_values
]

from vllm.model_executor.models.intern_vit import InternVisionModel
vllm_model = InternVisionModel(config)
vllm_model.load_weights(hf_model.state_dict().items())
