[V1][Metrics] Add per-request prompt/generation_tokens histograms (vllm-project#12516)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
markmc authored and ShangmingCai committed Feb 10, 2025
1 parent a069e3b commit 107ddcb
Showing 5 changed files with 102 additions and 14 deletions.
6 changes: 6 additions & 0 deletions tests/entrypoints/openai/test_metrics.py
@@ -202,6 +202,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:num_requests_waiting",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
"vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count",
]
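
These three suffixes per metric follow from how prometheus_client exposes a single Histogram as three time series (_sum, _bucket, _count). A minimal standalone sketch of that behavior, using a throwaway registry and a placeholder "dummy-model" label value rather than anything from this commit:

import prometheus_client

# Standalone sketch: a Histogram named like the new metric yields the
# _sum/_bucket/_count series the test above now expects. The registry and
# "dummy-model" label value are placeholders, not taken from the commit.
registry = prometheus_client.CollectorRegistry()
hist = prometheus_client.Histogram(
    name="vllm:request_prompt_tokens",
    documentation="Number of prefill tokens processed.",
    buckets=[1, 2, 5, 10],
    labelnames=["model_name"],
    registry=registry).labels("dummy-model")
hist.observe(3)

exposition = prometheus_client.generate_latest(registry).decode()
assert "vllm:request_prompt_tokens_sum" in exposition
assert "vllm:request_prompt_tokens_bucket" in exposition
assert "vllm:request_prompt_tokens_count" in exposition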


3 changes: 1 addition & 2 deletions vllm/v1/engine/async_llm.py
@@ -53,8 +53,7 @@ def __init__(
self.log_stats = log_stats
self.stat_loggers: List[StatLoggerBase] = [
LoggingStatLogger(),
PrometheusStatLogger(labels=dict(
model_name=self.model_config.served_model_name)),
PrometheusStatLogger(vllm_config.model_config),
]

# Tokenizer (+ ensure liveness if running in another process).
11 changes: 9 additions & 2 deletions vllm/v1/engine/output_processor.py
@@ -8,7 +8,7 @@
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.v1.engine.detokenizer import (DetokenizerOutput,
IncrementalDetokenizer)
from vllm.v1.metrics.stats import IterationStats
from vllm.v1.metrics.stats import IterationStats, RequestStateStats


@dataclass
@@ -37,6 +37,8 @@ def __init__(
self.is_prefilling = True
self.queue = queue

self.stats = RequestStateStats()

@classmethod
def from_new_request(
cls,
@@ -146,7 +148,8 @@ def process_outputs(
# 1) Compute stats for this iteration.
iteration_stats.update_from_output(engine_core_output,
req_state.is_prefilling,
req_state.prompt_len)
req_state.prompt_len,
req_state.stats)
req_state.is_prefilling = False

# 2) Detokenize the token ids into text.
@@ -171,6 +174,10 @@
# detected stop string, abort needed in EngineCore.
reqs_to_abort.append(req_id)

# Track per-request stats
iteration_stats.update_from_finished_request(
request_output, req_state.stats)

return OutputProcessorOutput(
request_outputs=request_outputs,
reqs_to_abort=reqs_to_abort,
60 changes: 54 additions & 6 deletions vllm/v1/metrics/loggers.py
@@ -1,10 +1,11 @@
import time
from abc import ABC, abstractmethod
from typing import Dict, List
from typing import List

import numpy as np
import prometheus_client

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.v1.metrics.stats import IterationStats, SchedulerStats

@@ -78,13 +79,13 @@ def log(self, scheduler_stats: SchedulerStats,

class PrometheusStatLogger(StatLoggerBase):

def __init__(self, labels: Dict[str, str]):
self.labels = labels
def __init__(self, model_config: ModelConfig):
self._unregister_vllm_metrics()

labelnames = self.labels.keys()
labelvalues = self.labels.values()
labelnames = ["model_name"]
labelvalues = [model_config.served_model_name]

self._unregister_vllm_metrics()
max_model_len = model_config.max_model_len

self.gauge_scheduler_running = prometheus_client.Gauge(
name="vllm:num_requests_running",
@@ -106,6 +107,20 @@ def __init__(self, labels: Dict[str, str]):
documentation="Number of generation tokens processed.",
labelnames=labelnames).labels(*labelvalues)

self.histogram_num_prompt_tokens_request = \
prometheus_client.Histogram(
name="vllm:request_prompt_tokens",
documentation="Number of prefill tokens processed.",
buckets=build_1_2_5_buckets(max_model_len),
labelnames=labelnames).labels(*labelvalues)

self.histogram_num_generation_tokens_request = \
prometheus_client.Histogram(
name="vllm:request_generation_tokens",
documentation="Number of generation tokens processed.",
buckets=build_1_2_5_buckets(max_model_len),
labelnames=labelnames).labels(*labelvalues)

def log(self, scheduler_stats: SchedulerStats,
iteration_stats: IterationStats):
"""Log to prometheus."""
@@ -116,9 +131,42 @@ def log(self, scheduler_stats: SchedulerStats,
self.counter_generation_tokens.inc(
iteration_stats.num_generation_tokens)

for finished_request in iteration_stats.finished_requests:
self.histogram_num_prompt_tokens_request.observe(
finished_request.num_prompt_tokens)
self.histogram_num_generation_tokens_request.observe(
finished_request.num_generation_tokens)

@staticmethod
def _unregister_vllm_metrics():
        # Unregister any existing vLLM collectors (for CI/CD)
for collector in list(prometheus_client.REGISTRY._collector_to_names):
if hasattr(collector, "_name") and "vllm" in collector._name:
prometheus_client.REGISTRY.unregister(collector)


def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
"""
Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values until the value exceeds the specified maximum.
"""
exponent = 0
buckets: List[int] = []
while True:
for m in mantissa_lst:
value = m * 10**exponent
if value <= max_value:
buckets.append(value)
else:
return buckets
exponent += 1


def build_1_2_5_buckets(max_value: int) -> List[int]:
"""
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
return build_buckets([1, 2, 5], max_value)
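
For a sense of scale (not part of the diff): with a hypothetical max_model_len of 4096, the helper above produces eleven finite bucket boundaries, and prometheus_client appends the +Inf bucket itself.

# Hypothetical usage; 4096 is an assumed max_model_len, not taken from the commit.
from vllm.v1.metrics.loggers import build_1_2_5_buckets

assert build_1_2_5_buckets(4096) == [
    1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000
]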
36 changes: 32 additions & 4 deletions vllm/v1/metrics/stats.py
@@ -1,7 +1,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
from vllm.outputs import RequestOutput
from vllm.v1.engine import EngineCoreOutput


@@ -16,24 +17,51 @@ class SchedulerStats:
# gpu_prefix_cache_hit_rate: float = 0.0


@dataclass
class RequestStateStats:
"""Stats that need to be tracked across delta updates."""

num_generation_tokens: int = 0


@dataclass
class FinishedRequestStats:
"""Stats associated with a finished request."""

num_prompt_tokens: int = 0
num_generation_tokens: int = 0


class IterationStats:
"""Stats associated with a single set of EngineCoreOutputs."""

def __init__(self, log_stats: bool):
self.log_stats = log_stats
self.num_generation_tokens = 0
self.num_prompt_tokens = 0
self.finished_requests: List[FinishedRequestStats] = []

def update_from_output(self, output: "EngineCoreOutput",
is_prefilling: bool, prompt_len: int):
is_prefilling: bool, prompt_len: int,
request_state_stats: RequestStateStats):
if not self.log_stats:
return

self.num_generation_tokens += len(output.new_token_ids)
num_new_generation_tokens = len(output.new_token_ids)

self.num_generation_tokens += num_new_generation_tokens
if is_prefilling:
# This relies on the invariant that EngineCore does
# not stream outputs for partially completed prefills
# (scheduler.update_from_output makes EngineCoreOutput
# iff num_computed_tokens == num_tokens).
assert (len(output.new_token_ids) > 0)
assert (num_new_generation_tokens > 0)
self.num_prompt_tokens += prompt_len

request_state_stats.num_generation_tokens += num_new_generation_tokens

def update_from_finished_request(self, request_output: "RequestOutput",
request_state_stats: RequestStateStats):
self.finished_requests.append(
FinishedRequestStats(len(request_output.prompt_token_ids),
request_state_stats.num_generation_tokens))
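
Taken together, the flow added here is: RequestStateStats accumulates generation tokens across streaming deltas, and update_from_finished_request folds the totals into a FinishedRequestStats entry that PrometheusStatLogger.log() observes into the new histograms. A rough standalone sketch of that flow, using SimpleNamespace stand-ins for EngineCoreOutput and RequestOutput rather than the real engine objects:

from types import SimpleNamespace

from vllm.v1.metrics.stats import IterationStats, RequestStateStats

req_stats = RequestStateStats()
iteration_stats = IterationStats(log_stats=True)

# First delta: prefill completes and the first token is emitted.
first = SimpleNamespace(new_token_ids=[101])
iteration_stats.update_from_output(first, is_prefilling=True,
                                   prompt_len=7,
                                   request_state_stats=req_stats)

# A later delta with two more generated tokens.
later = SimpleNamespace(new_token_ids=[102, 103])
iteration_stats.update_from_output(later, is_prefilling=False,
                                   prompt_len=7,
                                   request_state_stats=req_stats)

# On finish, the per-request totals become a FinishedRequestStats entry,
# which the Prometheus logger later observes into the histograms.
finished = SimpleNamespace(prompt_token_ids=[0] * 7)
iteration_stats.update_from_finished_request(finished, req_stats)
assert iteration_stats.finished_requests[0].num_prompt_tokens == 7
assert iteration_stats.finished_requests[0].num_generation_tokens == 3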
