[Misc] Add OpenTelemetry support #4687

Merged Jun 18, 2024

Commits (23)
8d8d4e2
Add OpenTelemetry support
ronensc May 6, 2024
2aa077d
Create a trace only when trace_context is not None
ronensc May 28, 2024
dd9ffcc
Use constants for span attribute names
ronensc May 29, 2024
2ce871d
Remove required dependency on opentelemetry
ronensc Jun 3, 2024
5ebb1f3
Add --otlp_endpoint flag
ronensc Jun 3, 2024
dcdac44
Write a one-time warning log for requests with a tracing header when …
ronensc Jun 4, 2024
aa972bb
Relax condition
ronensc Jun 4, 2024
5b57958
Decouple tracing for log stats
ronensc Jun 5, 2024
c7f1005
Rename --otlp-endpoint -> --otlp-traces-endpoint
ronensc Jun 5, 2024
ca49ef3
Update Otel.md
ronensc Jun 5, 2024
f69936e
Support grpc exporter and make it the default
ronensc Jun 5, 2024
a144cbf
Update Otel.md
ronensc Jun 5, 2024
306c805
Rename OTEL_EXPORTER_OTLP_ENDPOINT -> OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
ronensc Jun 5, 2024
70e58ad
Change exporter in dummy_client to grpc
ronensc Jun 5, 2024
c97d72f
Add --otlp-traces-endpoint flag to benchmark_latency.py
ronensc Jun 10, 2024
964e613
Add e2e test for tracing
ronensc Jun 11, 2024
edfdf55
Add tracing to CI
ronensc Jun 11, 2024
b7d6f7e
Add is_tracing_enabled()
ronensc Jun 13, 2024
b5d2735
Pass trace headers instead of trace context
ronensc Jun 13, 2024
2f8b0ab
Handle passing None to extract_trace_context
ronensc Jun 13, 2024
9fa949e
Assert otel is installed in init_tracer()
ronensc Jun 17, 2024
a6d17f3
Fix bug itroduced by git rebase
ronensc Jun 17, 2024
21dba06
Fix lint error
ronensc Jun 17, 2024
9 changes: 9 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -159,6 +159,15 @@ steps:
#mirror_hardwares: [amd]
command: pytest -v -s quantization

- label: Tracing Test
commands:
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing

- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
48 changes: 28 additions & 20 deletions benchmarks/benchmark_latency.py
@@ -20,26 +20,29 @@ def main(args: argparse.Namespace):

# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend)
llm = LLM(
model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
)

sampling_params = SamplingParams(
n=args.n,
@@ -254,5 +257,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
args = parser.parse_args()
main(args)
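Beyond the new `--otlp-traces-endpoint` CLI flag, the same setting is exposed as a keyword argument on the `LLM` constructor, which is how the e2e test later in this PR exercises it. A minimal sketch, assuming the OpenTelemetry packages are installed and an OTLP gRPC collector is reachable at `localhost:4317`:

```
from vllm import LLM, SamplingParams

# Assumed local OTLP gRPC collector endpoint; nothing in this PR starts one.
llm = LLM(model="facebook/opt-125m",
          otlp_traces_endpoint="localhost:4317")

outputs = llm.generate(["San Francisco is a"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```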
82 changes: 82 additions & 0 deletions examples/production_monitoring/Otel.md
@@ -0,0 +1,82 @@
# Setup OpenTelemetry POC

Review comment (Collaborator): Can you update this with the required dependencies (and add a pip install example)?

1. Install OpenTelemetry packages:
```
pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai
```

1. Start Jaeger in a docker container:
```
# From: https://www.jaegertracing.io/docs/1.57/getting-started/
docker run --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-p 16686:16686 \
-p 4317:4317 \
-p 4318:4318 \
-p 14250:14250 \
-p 14268:14268 \
-p 14269:14269 \
-p 9411:9411 \
jaegertracing/all-in-one:1.57
```

1. In a new shell, export Jaeger IP:
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
```
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
```
export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```

1. In a new shell, send requests with trace context from a dummy client:
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
export OTEL_SERVICE_NAME="client-service"
python dummy_client.py
```

1. Open the Jaeger web UI: http://localhost:16686/

In the search pane, select the `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
![Traces](https://i.imgur.com/GYHhFjo.png)

1. Clicking on a trace will show its spans and their tags. In this demo, each trace has two spans: one from the dummy client containing the prompt text, and one from vLLM containing metadata about the request (the attribute keys on the vLLM span are sketched after this list).
![Spans details](https://i.imgur.com/OPf6CBL.png)
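For reference, the metadata on the vLLM span corresponds to the attributes asserted in the e2e test added by this PR (`tests/tracing/test_tracing.py`). A short sketch listing the keys by their symbolic names (the underlying attribute strings are defined in `vllm.tracing` and are not shown in this diff):

```
from vllm.tracing import SpanAttributes

# Attribute keys carried by the vLLM server span, per the PR's e2e test.
VLLM_SPAN_ATTRIBUTE_KEYS = [
    SpanAttributes.LLM_RESPONSE_MODEL,
    SpanAttributes.LLM_REQUEST_ID,
    SpanAttributes.LLM_REQUEST_TEMPERATURE,
    SpanAttributes.LLM_REQUEST_TOP_P,
    SpanAttributes.LLM_REQUEST_MAX_TOKENS,
    SpanAttributes.LLM_REQUEST_BEST_OF,
    SpanAttributes.LLM_REQUEST_N,
    SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
    SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
    SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE,
    SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN,
    SpanAttributes.LLM_LATENCY_E2E,
]
```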

## Exporter Protocol
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
```
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
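For context, the protocol switch maps to the two OTLP span exporter implementations shipped with `opentelemetry-exporter-otlp`. A minimal sketch of wiring up the `http/protobuf` variant programmatically (the endpoint is an assumption matching the Jaeger port above):

```
# Sketch: export spans over http/protobuf instead of grpc. The grpc variant
# used elsewhere in this PR lives under
# opentelemetry.exporter.otlp.proto.grpc.trace_exporter.
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
    OTLPSpanExporter)
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace import set_tracer_provider

# Assumed OTLP/HTTP endpoint (the Jaeger container above listens on 4318).
exporter = OTLPSpanExporter(endpoint="http://localhost:4318/v1/traces")
provider = TracerProvider()
provider.add_span_processor(BatchSpanProcessor(exporter))
set_tracer_provider(provider)
```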

## Instrumentation of FastAPI
OpenTelemetry allows automatic instrumentation of FastAPI.
1. Install the instrumentation library
```
pip install opentelemetry-instrumentation-fastapi
```

1. Run vLLM with `opentelemetry-instrument`
```
opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m"
```

1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.

![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)
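As an aside (not part of the Otel.md file added here), the same instrumentation package also offers a programmatic API, which can be handy when the FastAPI app object is constructed in your own code. A minimal sketch, assuming `opentelemetry-instrumentation-fastapi` is installed:

```
from fastapi import FastAPI
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

app = FastAPI()  # stand-in for an existing application object

# Installs OpenTelemetry middleware that opens a server span per HTTP
# request, similar in effect to launching under `opentelemetry-instrument`.
FastAPIInstrumentor.instrument_app(app)
```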
35 changes: 35 additions & 0 deletions examples/production_monitoring/dummy_client.py
@@ -0,0 +1,35 @@
import requests
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter)
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
ConsoleSpanExporter)
from opentelemetry.trace import SpanKind, set_tracer_provider
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator)

trace_provider = TracerProvider()
set_tracer_provider(trace_provider)

trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))

tracer = trace_provider.get_tracer("dummy-client")

url = "http://localhost:8000/v1/completions"
with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
prompt = "San Francisco is a"
span.set_attribute("prompt", prompt)
headers = {}
TraceContextTextMapPropagator().inject(headers)
payload = {
"model": "facebook/opt-125m",
"prompt": prompt,
"max_tokens": 10,
"best_of": 20,
"n": 3,
"use_beam_search": "true",
"temperature": 0.0,
# "stream": True,
}
response = requests.post(url, headers=headers, json=payload)
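For orientation, the server side of this flow is the inverse of the `inject()` call above: vLLM extracts the W3C `traceparent`/`tracestate` headers back into a trace context and parents its own span on it. A minimal sketch of that extraction step (simplified; the actual helper in `vllm/tracing.py` may differ in detail):

```
from typing import Mapping, Optional

from opentelemetry.context import Context
from opentelemetry.trace.propagation.tracecontext import (
    TraceContextTextMapPropagator)


def extract_trace_context(
        headers: Optional[Mapping[str, str]]) -> Optional[Context]:
    # Illustrative counterpart of the client-side inject(): turn the trace
    # headers carried on the HTTP request back into an OpenTelemetry Context.
    if headers is None:
        return None
    return TraceContextTextMapPropagator().extract(carrier=headers)
```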
Empty file added tests/tracing/__init__.py
Empty file.
116 changes: 116 additions & 0 deletions tests/tracing/test_tracing.py
@@ -0,0 +1,116 @@
import os
import threading
from concurrent import futures
from typing import Callable, Dict, Iterable, Literal

import grpc
import pytest
from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
ExportTraceServiceResponse)
from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
TraceServiceServicer, add_TraceServiceServicer_to_server)
from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
from opentelemetry.sdk.environment_variables import (
OTEL_EXPORTER_OTLP_TRACES_INSECURE)

from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes

FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"

FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
'array_value']


def decode_value(value: AnyValue):
field_decoders: Dict[FieldName, Callable] = {
"bool_value": (lambda v: v.bool_value),
"string_value": (lambda v: v.string_value),
"int_value": (lambda v: v.int_value),
"double_value": (lambda v: v.double_value),
"array_value":
(lambda v: [decode_value(item) for item in v.array_value.values]),
}
for field, decoder in field_decoders.items():
if value.HasField(field):
return decoder(value)
raise ValueError(f"Couldn't decode value: {value}")


def decode_attributes(attributes: Iterable[KeyValue]):
return {kv.key: decode_value(kv.value) for kv in attributes}


class FakeTraceService(TraceServiceServicer):

def __init__(self):
self.request = None
self.evt = threading.Event()

def Export(self, request, context):
self.request = request
self.evt.set()
return ExportTraceServiceResponse()


@pytest.fixture
def trace_service():
"""Fixture to set up a fake gRPC trace service"""
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
service = FakeTraceService()
add_TraceServiceServicer_to_server(service, server)
server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
server.start()

yield service

server.stop(None)


def test_traces(trace_service):
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"

sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=256)
model = "facebook/opt-125m"
llm = LLM(
model=model,
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
)
prompts = ["This is a short prompt"]
outputs = llm.generate(prompts, sampling_params=sampling_params)

timeout = 5
if not trace_service.evt.wait(timeout):
raise TimeoutError(
f"The fake trace service didn't receive a trace within "
f"the {timeout} seconds timeout")

attributes = decode_attributes(trace_service.request.resource_spans[0].
scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
assert attributes.get(
SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
assert attributes.get(
SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
assert attributes.get(
SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(
SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
assert attributes.get(
SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of
assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids)
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
assert attributes.get(
SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
metrics = outputs[0].metrics
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
ttft = metrics.first_token_time - metrics.arrival_time
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
e2e_time = metrics.finished_time - metrics.arrival_time
assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
13 changes: 13 additions & 0 deletions vllm/config.py
@@ -10,6 +10,7 @@
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.model_executor.models import ModelRegistry
from vllm.tracing import is_otel_installed
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
is_hip, is_neuron, is_tpu)
@@ -1362,6 +1363,17 @@ def __post_init__(self):
f"must be one of {valid_guided_backends}")


@dataclass
class ObservabilityConfig:
"""Configuration for observability."""
otlp_traces_endpoint: Optional[str] = None

def __post_init__(self):
if not is_otel_installed() and self.otlp_traces_endpoint is not None:
raise ValueError("OpenTelemetry packages must be installed before "
"configuring 'otlp_traces_endpoint'")


@dataclass(frozen=True)
class EngineConfig:
"""Dataclass which contains all engine-related configuration. This
@@ -1378,6 +1390,7 @@ class EngineConfig:
vision_language_config: Optional[VisionLanguageConfig]
speculative_config: Optional[SpeculativeConfig]
decoding_config: Optional[DecodingConfig]
observability_config: Optional[ObservabilityConfig]

def __post_init__(self):
"""Verify configs are valid & consistent with each other.
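The guard above relies on an `is_otel_installed()` helper imported from `vllm.tracing` (added by this PR but not included in this diff view). A plausible minimal sketch of such a check, for orientation only:

```
def is_otel_installed() -> bool:
    # Hypothetical sketch: report whether the optional OpenTelemetry
    # packages can be imported; the real helper in vllm/tracing.py may
    # differ in which modules it probes.
    try:
        import opentelemetry.sdk.trace  # noqa: F401
        import opentelemetry.trace  # noqa: F401
        return True
    except ImportError:
        return False
```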