[Misc] Add OpenTelemetry support #4687

Merged Jun 18, 2024

Commits (23)
8d8d4e2
Add OpenTelemetry support
ronensc May 6, 2024
2aa077d
Create a trace only when trace_context is not None
ronensc May 28, 2024
dd9ffcc
Use constants for span attribute names
ronensc May 29, 2024
2ce871d
Remove required dependency on opentelemetry
ronensc Jun 3, 2024
5ebb1f3
Add --otlp_endpoint flag
ronensc Jun 3, 2024
dcdac44
Write a one-time warning log for requests with a tracing header when …
ronensc Jun 4, 2024
aa972bb
Relax condition
ronensc Jun 4, 2024
5b57958
Decouple tracing for log stats
ronensc Jun 5, 2024
c7f1005
Rename --otlp-endpoint -> --otlp-traces-endpoint
ronensc Jun 5, 2024
ca49ef3
Update Otel.md
ronensc Jun 5, 2024
f69936e
Support grpc exporter and make it the default
ronensc Jun 5, 2024
a144cbf
Update Otel.md
ronensc Jun 5, 2024
306c805
Rename OTEL_EXPORTER_OTLP_ENDPOINT -> OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
ronensc Jun 5, 2024
70e58ad
Change exporter in dummy_client to grpc
ronensc Jun 5, 2024
c97d72f
Add --otlp-traces-endpoint flag to benchmark_latency.py
ronensc Jun 10, 2024
964e613
Add e2e test for tracing
ronensc Jun 11, 2024
edfdf55
Add tracing to CI
ronensc Jun 11, 2024
b7d6f7e
Add is_tracing_enabled()
ronensc Jun 13, 2024
b5d2735
Pass trace headers instead of trace context
ronensc Jun 13, 2024
2f8b0ab
Handle passing None to extract_trace_context
ronensc Jun 13, 2024
9fa949e
Assert otel is installed in init_tracer()
ronensc Jun 17, 2024
a6d17f3
Fix bug itroduced by git rebase
ronensc Jun 17, 2024
21dba06
Fix lint error
ronensc Jun 17, 2024
9 changes: 9 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -159,6 +159,15 @@ steps:
#mirror_hardwares: [amd]
command: pytest -v -s quantization

- label: Tracing Test
commands:
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing

- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
48 changes: 28 additions & 20 deletions benchmarks/benchmark_latency.py
@@ -20,26 +20,29 @@ def main(args: argparse.Namespace):

# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend)
llm = LLM(
model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
)

sampling_params = SamplingParams(
n=args.n,
@@ -254,5 +257,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
args = parser.parse_args()
main(args)
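Beyond the new `--otlp-traces-endpoint` CLI flag, the same setting is exposed as a keyword argument on the `LLM` constructor, which is how the e2e test later in this PR exercises it. A minimal sketch, assuming the OpenTelemetry packages are installed and an OTLP gRPC collector is reachable at `localhost:4317`:

```
from vllm import LLM, SamplingParams

# Assumed local OTLP gRPC collector endpoint; nothing in this PR starts one.
llm = LLM(model="facebook/opt-125m",
          otlp_traces_endpoint="localhost:4317")

outputs = llm.generate(["San Francisco is a"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```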
82 changes: 82 additions & 0 deletions examples/production_monitoring/Otel.md
@@ -0,0 +1,82 @@
# Setup OpenTelemetry POC

Review comment (Collaborator): Can you update this with the required dependencies (and add a pip install example)?

1. Install OpenTelemetry packages:
```
pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai
```

1. Start Jaeger in a docker container:
```
# From: https://www.jaegertracing.io/docs/1.57/getting-started/
docker run --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-p 16686:16686 \
-p 4317:4317 \
-p 4318:4318 \
-p 14250:14250 \
-p 14268:14268 \
-p 14269:14269 \
-p 9411:9411 \
jaegertracing/all-in-one:1.57
```

1. In a new shell, export Jaeger IP:
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
```
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
```
export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```

1. In a new shell, send requests with trace context from a dummy client:
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
export OTEL_SERVICE_NAME="client-service"
python dummy_client.py
```

1. Open the Jaeger web UI: http://localhost:16686/

In the search pane, select the `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
![Traces](https://i.imgur.com/GYHhFjo.png)

1. Clicking on a trace will show its spans and their tags. In this demo, each trace has two spans: one from the dummy client containing the prompt text, and one from vLLM containing metadata about the request (the attribute keys on the vLLM span are sketched after this list).
![Spans details](https://i.imgur.com/OPf6CBL.png)
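For reference, the metadata on the vLLM span corresponds to the attributes asserted in the e2e test added by this PR (`tests/tracing/test_tracing.py`). A short sketch listing the keys by their symbolic names (the underlying attribute strings are defined in `vllm.tracing` and are not shown in this diff):

```
from vllm.tracing import SpanAttributes

# Attribute keys carried by the vLLM server span, per the PR's e2e test.
VLLM_SPAN_ATTRIBUTE_KEYS = [
    SpanAttributes.LLM_RESPONSE_MODEL,
    SpanAttributes.LLM_REQUEST_ID,
    SpanAttributes.LLM_REQUEST_TEMPERATURE,
    SpanAttributes.LLM_REQUEST_TOP_P,
    SpanAttributes.LLM_REQUEST_MAX_TOKENS,
    SpanAttributes.LLM_REQUEST_BEST_OF,
    SpanAttributes.LLM_REQUEST_N,
    SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
    SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
    SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE,
    SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN,
    SpanAttributes.LLM_LATENCY_E2E,
]
```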

## Exporter Protocol
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
```
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
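For context, the protocol switch maps to the two OTLP span exporter implementations shipped with `opentelemetry-exporter-otlp`. A minimal sketch of wiring up the `http/protobuf` variant programmatically (the endpoint is an assumption matching the Jaeger port above):

```
# Sketch: export spans over http/protobuf instead of grpc. The grpc variant
# used elsewhere in this PR lives under
# opentelemetry.exporter.otlp.proto.grpc.trace_exporter.
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
    OTLPSpanExporter)
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace import set_tracer_provider

# Assumed OTLP/HTTP endpoint (the Jaeger container above listens on 4318).
exporter = OTLPSpanExporter(endpoint="http://localhost:4318/v1/traces")
provider = TracerProvider()
provider.add_span_processor(BatchSpanProcessor(exporter))
set_tracer_provider(provider)
```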

## Instrumentation of FastAPI
OpenTelemetry allows automatic instrumentation of FastAPI.
1. Install the instrumentation library
```
pip install opentelemetry-instrumentation-fastapi
```

1. Run vLLM with `opentelemetry-instrument`
```
opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m"
```

1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.

![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)
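As an aside (not part of the Otel.md file added here), the same instrumentation package also offers a programmatic API, which can be handy when the FastAPI app object is constructed in your own code. A minimal sketch, assuming `opentelemetry-instrumentation-fastapi` is installed:

```
from fastapi import FastAPI
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

app = FastAPI()  # stand-in for an existing application object

# Installs OpenTelemetry middleware that opens a server span per HTTP
# request, similar in effect to launching under `opentelemetry-instrument`.
FastAPIInstrumentor.instrument_app(app)
```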
35 changes: 35 additions & 0 deletions examples/production_monitoring/dummy_client.py
@@ -0,0 +1,35 @@
import requests
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter)
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
ConsoleSpanExporter)
from opentelemetry.trace import SpanKind, set_tracer_provider
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator)

trace_provider = TracerProvider()
set_tracer_provider(trace_provider)

trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))

tracer = trace_provider.get_tracer("dummy-client")

url = "http://localhost:8000/v1/completions"
with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
prompt = "San Francisco is a"
span.set_attribute("prompt", prompt)
headers = {}
TraceContextTextMapPropagator().inject(headers)
payload = {
"model": "facebook/opt-125m",
"prompt": prompt,
"max_tokens": 10,
"best_of": 20,
"n": 3,
"use_beam_search": "true",
"temperature": 0.0,
# "stream": True,
}
response = requests.post(url, headers=headers, json=payload)
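For orientation, the server side of this flow is the inverse of the `inject()` call above: vLLM extracts the W3C `traceparent`/`tracestate` headers back into a trace context and parents its own span on it. A minimal sketch of that extraction step (simplified; the actual helper in `vllm/tracing.py` may differ in detail):

```
from typing import Mapping, Optional

from opentelemetry.context import Context
from opentelemetry.trace.propagation.tracecontext import (
    TraceContextTextMapPropagator)


def extract_trace_context(
        headers: Optional[Mapping[str, str]]) -> Optional[Context]:
    # Illustrative counterpart of the client-side inject(): turn the trace
    # headers carried on the HTTP request back into an OpenTelemetry Context.
    if headers is None:
        return None
    return TraceContextTextMapPropagator().extract(carrier=headers)
```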
Empty file added tests/tracing/__init__.py
Empty file.
116 changes: 116 additions & 0 deletions tests/tracing/test_tracing.py
@@ -0,0 +1,116 @@
import os
import threading
from concurrent import futures
from typing import Callable, Dict, Iterable, Literal

import grpc
import pytest
from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
ExportTraceServiceResponse)
from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
TraceServiceServicer, add_TraceServiceServicer_to_server)
from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
from opentelemetry.sdk.environment_variables import (
OTEL_EXPORTER_OTLP_TRACES_INSECURE)

from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes

FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"

FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
'array_value']


def decode_value(value: AnyValue):
field_decoders: Dict[FieldName, Callable] = {
"bool_value": (lambda v: v.bool_value),
"string_value": (lambda v: v.string_value),
"int_value": (lambda v: v.int_value),
"double_value": (lambda v: v.double_value),
"array_value":
(lambda v: [decode_value(item) for item in v.array_value.values]),
}
for field, decoder in field_decoders.items():
if value.HasField(field):
return decoder(value)
raise ValueError(f"Couldn't decode value: {value}")


def decode_attributes(attributes: Iterable[KeyValue]):
return {kv.key: decode_value(kv.value) for kv in attributes}


class FakeTraceService(TraceServiceServicer):

def __init__(self):
self.request = None
self.evt = threading.Event()

def Export(self, request, context):
self.request = request
self.evt.set()
return ExportTraceServiceResponse()


@pytest.fixture
def trace_service():
"""Fixture to set up a fake gRPC trace service"""
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
service = FakeTraceService()
add_TraceServiceServicer_to_server(service, server)
server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
server.start()

yield service

server.stop(None)


def test_traces(trace_service):
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"

sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=256)
model = "facebook/opt-125m"
llm = LLM(
model=model,
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
)
prompts = ["This is a short prompt"]
outputs = llm.generate(prompts, sampling_params=sampling_params)

timeout = 5
if not trace_service.evt.wait(timeout):
raise TimeoutError(
f"The fake trace service didn't receive a trace within "
f"the {timeout} seconds timeout")

attributes = decode_attributes(trace_service.request.resource_spans[0].
scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
assert attributes.get(
SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
assert attributes.get(
SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
assert attributes.get(
SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(
SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
assert attributes.get(
SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of
assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids)
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
assert attributes.get(
SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
metrics = outputs[0].metrics
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
ttft = metrics.first_token_time - metrics.arrival_time
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
e2e_time = metrics.finished_time - metrics.arrival_time
assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
13 changes: 13 additions & 0 deletions vllm/config.py
@@ -10,6 +10,7 @@
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.model_executor.models import ModelRegistry
from vllm.tracing import is_otel_installed
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
is_hip, is_neuron, is_tpu)
@@ -1362,6 +1363,17 @@ def __post_init__(self):
f"must be one of {valid_guided_backends}")


@dataclass
class ObservabilityConfig:
"""Configuration for observability."""
otlp_traces_endpoint: Optional[str] = None

def __post_init__(self):
if not is_otel_installed() and self.otlp_traces_endpoint is not None:
raise ValueError("OpenTelemetry packages must be installed before "
"configuring 'otlp_traces_endpoint'")


@dataclass(frozen=True)
class EngineConfig:
"""Dataclass which contains all engine-related configuration. This
@@ -1378,6 +1390,7 @@ class EngineConfig:
vision_language_config: Optional[VisionLanguageConfig]
speculative_config: Optional[SpeculativeConfig]
decoding_config: Optional[DecodingConfig]
observability_config: Optional[ObservabilityConfig]

def __post_init__(self):
"""Verify configs are valid & consistent with each other.
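The guard above relies on an `is_otel_installed()` helper imported from `vllm.tracing` (added by this PR but not included in this diff view). A plausible minimal sketch of such a check, for orientation only:

```
def is_otel_installed() -> bool:
    # Hypothetical sketch: report whether the optional OpenTelemetry
    # packages can be imported; the real helper in vllm/tracing.py may
    # differ in which modules it probes.
    try:
        import opentelemetry.sdk.trace  # noqa: F401
        import opentelemetry.trace  # noqa: F401
        return True
    except ImportError:
        return False
```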