[V1] V1 engine implements parallel sampling, 1/2: AsyncLLM support #10980

Open · wants to merge 20 commits into base: main
59 changes: 59 additions & 0 deletions tests/v1/entrypoints/openai/test_completion.py
@@ -250,6 +250,65 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
    assert "".join(chunks) == single_output


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_parallel_no_streaming(client: openai.AsyncOpenAI,
                                     model_name: str):
    """Parallel sampling without streaming.
    A single request output contains a list of completions.
    """

    prompt = "What is an LLM?"
    n = 3
    max_tokens = 5

    completion = await client.completions.create(model=model_name,
                                                 prompt=prompt,
                                                 max_tokens=max_tokens,
                                                 n=n,
                                                 stream=False)

    for choice in completion.choices:
        assert choice.finish_reason is not None


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
    """Streaming for parallel sampling.
    The tokens from multiple samples are flattened into a single stream,
    with an index to indicate which sample each token belongs to.
    """

    prompt = "What is an LLM?"
    n = 3
    max_tokens = 5

    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=max_tokens,
                                             n=n,
                                             stream=True)
    chunks: List[List[str]] = [[] for i in range(n)]
    finish_reason_count = 0
    async for chunk in stream:
        index = chunk.choices[0].index
        text = chunk.choices[0].text
        chunks[index].append(text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    assert finish_reason_count == n
    for chunk in chunks:
        assert len(chunk) == max_tokens
        print("".join(chunk))


@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
133 changes: 131 additions & 2 deletions vllm/v1/engine/async_llm.py
@@ -2,7 +2,7 @@

import asyncio
import os
-from typing import AsyncGenerator, List, Mapping, Optional, Type, Union
+from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union

import numpy as np

@@ -24,6 +24,8 @@
from vllm.utils import cdiv, kill_process_tree
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.parallel_sampling import (ParallelSamplingOutputProcessor,
                                              ParentRequestState)
from vllm.v1.engine.processor import Processor
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
@@ -50,6 +52,8 @@ def __init__(
        assert start_engine_loop

        self.model_config = vllm_config.model_config
        self.enable_prefix_caching = (
            vllm_config.cache_config.enable_prefix_caching)

        self.log_requests = log_requests
        self.log_stats = log_stats
@@ -170,7 +174,7 @@ async def add_request(
    # requests we don't need to send multiple messages to core proc,
    # and so we don't need multiple streams which then get
    # re-multiplexed in the API server anyhow.
-    async def generate(
+    async def _generate(
        self,
        prompt: PromptType,
        sampling_params: SamplingParams,
@@ -241,6 +245,131 @@ async def generate(
            await self.abort(request_id)
            raise

    async def _parallel_sampling_child_gen(
Review comment (Member): Suggest moving this to a method on ParentRequestState; it doesn't need to have the output_processor arg then either. (See the sketch after this method.)

        self,
        child_gen: AsyncGenerator[RequestOutput, None],
        output_processor: ParallelSamplingOutputProcessor,
        index: int,
    ) -> AsyncGenerator[RequestOutput, None]:
        """Output generator for a single parallel sampling child request.

        Each parallel sampling request triggers at least two child requests.
        This generator yields zero or more request outputs to return to the
        caller, as they become available.

        Args:
            child_gen: generator for child request outputs.
            output_processor: transforms child request outputs into parent
                request outputs.
            index: index within the `n` child requests.

        Returns:
            Yields zero or more request outputs to return to the caller.
        """
        async for out in child_gen:
            if req_out := output_processor.process_output(out, index):
                yield req_out
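If the suggestion in the review comment above were adopted, this generator could live on ParentRequestState and drop the output_processor argument. A minimal sketch; the method name child_outputs is assumed for illustration and is not part of the PR:

    # Hypothetical sketch only, not the PR's API: as a ParentRequestState
    # method, the processor can be built from self and no longer needs to be
    # passed in by AsyncLLM.
    async def child_outputs(
        self,  # self is the ParentRequestState
        child_gen: AsyncGenerator[RequestOutput, None],
        index: int,
    ) -> AsyncGenerator[RequestOutput, None]:
        processor = ParallelSamplingOutputProcessor(self)
        async for out in child_gen:
            if parent_out := processor.process_output(out, index):
                yield parent_out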

    async def _generate_parallel_sampling(
        self,
        prompt: PromptType,
        sampling_params: SamplingParams,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        priority: int = 0,
    ) -> AsyncGenerator[RequestOutput, None]:
        """Generate completions for a parallel sampling request."""

        parent_state = ParentRequestState(request_id, sampling_params)
        output_processor = ParallelSamplingOutputProcessor(parent_state)
        n = parent_state.n

        # Adapted from sglang:
        # https://github.com/sgl-project/sglang/blob/
        # 4fe92bfca5517f3cf5ca967fc5fcfdb7cf335f30/
        # python/sglang/srt/managers/
        # tokenizer_manager.py#L456-L532

        if self.enable_prefix_caching:
            # If the engine uses APC, generate a "warmup request" with
            # max_tokens=1, which populates the APC.
            w_sampling_params = parent_state.get_warmup_sampling_params()
Review comment (Member): I think "prefill" or "priming" instead of "warmup" might be a better term to use for this.
Review comment (Collaborator) on lines +302 to +304: Again, why do we need this? I think this should be avoided.

            async for _ in self._generate(
Review comment (Member): May be good to do this exhaustion in an async task. Then the subsequent setup can happen in parallel. (See the sketch after this block.)

                prompt,
                w_sampling_params,
                parent_state.get_warmup_request_id(),
                lora_request,
                trace_headers,
                prompt_adapter_request,
                priority,
Review comment (Member) on lines +306 to +312: Best to include kwarg names here, lora_request=lora_request, etc. Also could make a dict with the common ones and reuse that below.

            ):
                # Exhaust the generator
                pass
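A rough sketch of the two suggestions above as they might apply to this APC branch: drain the priming request in a background asyncio task so the child-request setup can proceed concurrently, and gather the common keyword arguments into a dict that the child requests below could reuse. The names drain and common_kwargs, and the overall shape, are assumptions rather than the PR's code:

        # Sketch (assumed names, not the PR's code): drain the priming request
        # concurrently and reuse a dict of common kwargs for the child
        # requests created below.
        common_kwargs = dict(lora_request=lora_request,
                             trace_headers=trace_headers,
                             prompt_adapter_request=prompt_adapter_request,
                             priority=priority)

        async def drain(gen: AsyncGenerator[RequestOutput, None]) -> None:
            async for _ in gen:
                pass

        warmup_task = asyncio.create_task(
            drain(self._generate(prompt, w_sampling_params,
                                 parent_state.get_warmup_request_id(),
                                 **common_kwargs)))
        # ... build the n child generators here (they are lazy and not yet
        # consumed), then wait for the priming request before scheduling them:
        await warmup_task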

        # Aggregate generators for n child requests
        gens: List[AsyncGenerator[RequestOutput, None]] = []
        active: Dict[asyncio.Task, int] = {}
        seed = sampling_params.seed
        for idx in range(n):
            c_sampling_params = parent_state.get_child_sampling_params(seed)
            if seed is not None:
                seed += 1
Review comment (Member) on lines +322 to +324: Maybe a bit cleaner to just pass the index to this method and handle the seed logic inside. Actually, why not have a single method that returns the child sampling params and request id for a given index? (See the sketch after this loop.)

            child_gen = self._generate(
                prompt,
                c_sampling_params,
                parent_state.get_child_request_id(idx),
                lora_request,
                trace_headers,
                prompt_adapter_request,
                priority,
            )
            gen = self._parallel_sampling_child_gen(child_gen,
                                                    output_processor, idx)
            gens.append(gen)
            active[asyncio.create_task(gen.__anext__())] = idx  # type: ignore
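If the index-based suggestion above were adopted, the loop body could shrink to roughly the following; get_child_info is a hypothetical ParentRequestState method name, not part of the PR:

        # Hypothetical sketch: one ParentRequestState call per child index
        # instead of separate seed bookkeeping in this loop.
        for idx in range(n):
            c_sampling_params, c_request_id = parent_state.get_child_info(idx)
            child_gen = self._generate(prompt, c_sampling_params, c_request_id,
                                       lora_request, trace_headers,
                                       prompt_adapter_request, priority)
            ...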

        try:
            while active:
                done, _ = await asyncio.wait(
                    active.keys(), return_when=asyncio.FIRST_COMPLETED)
                for task in done:
                    idx = active.pop(task)
                    try:
                        result = task.result()
                        yield result
                        # Schedule the next result
                        active[asyncio.create_task(
                            gens[idx].__anext__())] = idx  # type: ignore
                    except StopAsyncIteration:
                        continue
        finally:
            for task in active:
                task.cancel()
Review comment (Member) on lines +339 to +355: I think the merge_async_generators util function can be used here. (See the sketch below.)
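A sketch of that simplification, assuming the utility meant is vllm.utils.merge_async_iterators, which yields (index, output) pairs; since the per-child index is already applied by the output processor, the merged index can be ignored and the manual asyncio.wait loop above collapses to a few lines:

        # Sketch: multiplex the child generators with an existing vLLM util
        # (assumed here to be vllm.utils.merge_async_iterators) instead of
        # the manual asyncio.wait loop above.
        from vllm.utils import merge_async_iterators

        async for _, out in merge_async_iterators(*gens):
            yield out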


    def generate(
        self,
        prompt: PromptType,
        sampling_params: SamplingParams,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        priority: int = 0,
    ) -> AsyncGenerator[RequestOutput, None]:
        n = sampling_params.n
        _generate = self._generate if n is None or n == 1 \
            else self._generate_parallel_sampling
        return _generate(prompt, sampling_params, request_id, lora_request,
                         trace_headers, prompt_adapter_request, priority)

    async def _run_output_handler(self):
        """Background loop: pulls from EngineCore and pushes to AsyncStreams."""
