From 4d0932b58cc0604d21475020f74b17e63e26238b Mon Sep 17 00:00:00 2001
From: Chang Su
Date: Tue, 19 Mar 2024 13:27:20 -0700
Subject: [PATCH 01/41] Disable KV Cache for Embedding serving and Add Embedding generation

Apply suggestions from code review

- Fix typo EmeddingResponseData
- Apply nit suggestion on EmbeddingRequest check

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>

Change v1/embedding to v1/embeddings

Add a check for EmbeddingRequest input token length

Add TODO comments for vllm-specific ValidationError in embedding

Add miscellaneous small fixes

- Format files
- Remove duplicate imports in llm_engine
- Fix imports in llama_embedding
- Remove doc in arg_utils

Separate llama_embedding from llama

- Add LlamaEmbeddingModel in llama_embedding.py
- Add _EMBEDDING_MODELS in models/__init__
- Move _MODELS to _GENERATION_MODELS

Refactor ModelRunner and remove profiling for embedding

Refactor ModelRunner
- Add EmbeddingModelRunner to override execute_model

Remove profiling for embedding
- Remove profile_max_batched_tokens_for_embedding in worker
- Set default max_num_batched_tokens to 32768 for embedding

Add BlockSpaceManagerV3 and remove DummyBlockSpaceManager

- Use BlockSpaceManagerV3 for simple block management for embedding
- Fix Scheduler.prompt_limit for embedding
- Fix typo

Replace 0.8 in profiling max_num_batched_tokens

- Utilize the gpu_memory_utilization factor instead

Refactor RequestOutput and SequenceGroupOutput

- Add an abstract base class of RequestOutput; separate Completion and Embedding
- Add an abstract base class of SequenceGroupOutput; separate Completion and Embedding

Add docstring for embedding_mode in ModelConfig

Remove max_tokens in EmbeddingRequest

Fix generation in model_runner.py and formatting

- Fix model.sample and embedding generation in model_runner.py
- Format files changed
- Fix embedding in llama.py as input_tokens' shape has changed
- Enable embedding in ray_gpu_executor.py

Add Embedding generation

Add Embedding API to entrypoints/openai
- Add EmbeddingRequest and Response to protocol
- Add serving_embedding
- Add OpenAIServingEmbedding to api_server
- Make llm.py work with embedding

Add Embedding in outputs and sequence
- Add EmbeddingOutput, EmbeddingRequestOutput, RequestOutputFactory and
  EmbeddingSequenceGroupOutput to support processing the embedding output
  sequence
- Update process output and sequence in *llm_engine to use embedding

Add MistralModel and embedding generation
- Add embedding and load_weights in LlamaModel to support forward
- Adapted from code examples in https://huggingface.co/intfloat/e5-mistral-7b-instruct
- Ensures the ops are run on GPU
- Mistral uses LlamaModel
- Use embedding when embedding_mode is True in model_runner

Uniform typing in multiple files
- Add EmbeddingRequestOutput to the output typing in llm, async_llm_engine
- Add validation for max_tokens in EmbeddingRequest
- Fix __repr__ of EmbeddingSequenceGroupOutput

Disable KV Cache for Embedding serving

Update profiling max_batch_size logic
- Return max_batch_size as each Ray worker runs the profiling once

Skip slot_mapping with embedding mode

slot_mapping is only used in model_executor/layers/attention.py when
kv_cache is not None. In embedding mode we pass a None kv_cache, so there
is no need to process slot_mapping.
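Example usage of the new /v1/embeddings route, for reference while reviewing.
This is a minimal client-side sketch only: the served model name, host, and
port below are assumptions rather than part of this patch, and it assumes an
embedding model such as intfloat/e5-mistral-7b-instruct is already being
served by the OpenAI-compatible server with embedding mode enabled.

    import requests

    # The api_server added here exposes POST /v1/embeddings.
    # Host and port are the usual vLLM defaults (assumed here).
    resp = requests.post(
        "http://localhost:8000/v1/embeddings",
        json={
            # EmbeddingRequest: "model" and "input" are required; "input"
            # may be a string, a list of strings, or token-id lists.
            "model": "intfloat/e5-mistral-7b-instruct",
            "input": ["Hello, world!", "vLLM embedding serving"],
        },
    )
    body = resp.json()

    # EmbeddingResponse.data holds one EmbeddingResponseData per input,
    # each with an "index" and its "embedding" vector (list of floats).
    for item in body["data"]:
        print(item["index"], len(item["embedding"]))

    # Token accounting is reported in "usage"; prompt_tokens equals
    # total_tokens since no tokens are generated for embeddings.
    print(body["usage"])

Note that encoding_format="base64" and the dimensions parameter are accepted
by the schema but currently rejected with an error response, per
serving_embedding.py below.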
Disable KV cache for embedding mode - Add embedding_mode to ModelConfig and SchedulerConfig - Add profile_max_batched_tokens_for_embedding to profile the max_num_batched_tokens for embedding server mode - Add DummyBlockManager for No-op block management in embedding mode - Add load_weights in llama.py to support embedding models --- docs/source/getting_started/quickstart.rst | 2 +- docs/source/quantization/auto_awq.rst | 2 +- docs/source/quantization/fp8_e5m2_kvcache.rst | 2 +- examples/llm_engine_example.py | 4 +- examples/multilora_inference.py | 4 +- examples/offline_inference.py | 5 +- examples/offline_inference_distributed.py | 4 +- examples/offline_inference_neuron.py | 5 +- examples/offline_inference_with_prefix.py | 5 +- tests/async_engine/test_request_tracker.py | 4 +- tests/spec_decode/utils.py | 6 +- tests/test_sequence.py | 13 +- vllm/__init__.py | 5 +- vllm/config.py | 10 + vllm/core/block_manager_v3.py | 84 +++++ vllm/core/interfaces.py | 4 + vllm/core/scheduler.py | 9 +- vllm/engine/arg_utils.py | 9 +- vllm/engine/async_llm_engine.py | 27 +- vllm/engine/llm_engine.py | 51 ++- vllm/entrypoints/llm.py | 15 +- vllm/entrypoints/openai/api_server.py | 20 +- vllm/entrypoints/openai/protocol.py | 23 ++ vllm/entrypoints/openai/serving_chat.py | 12 +- vllm/entrypoints/openai/serving_completion.py | 13 +- vllm/entrypoints/openai/serving_embedding.py | 157 +++++++++ vllm/entrypoints/openai/serving_engine.py | 16 +- vllm/model_executor/layers/sampler.py | 7 +- vllm/model_executor/models/__init__.py | 8 +- vllm/model_executor/models/llama_embedding.py | 327 ++++++++++++++++++ vllm/outputs.py | 111 +++++- vllm/sequence.py | 52 ++- vllm/spec_decode/spec_decode_worker.py | 5 +- vllm/spec_decode/util.py | 5 +- vllm/worker/embedding_model_runner.py | 70 ++++ vllm/worker/model_runner.py | 18 +- vllm/worker/worker.py | 5 +- 37 files changed, 1015 insertions(+), 104 deletions(-) create mode 100644 vllm/core/block_manager_v3.py create mode 100644 vllm/entrypoints/openai/serving_embedding.py create mode 100644 vllm/model_executor/models/llama_embedding.py create mode 100644 vllm/worker/embedding_model_runner.py diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 7c44a96865a5..03758d630f4a 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -48,7 +48,7 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O llm = LLM(model="facebook/opt-125m") -Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. +Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``CompletionRequestOutput`` objects, which include all the output tokens. .. code-block:: python diff --git a/docs/source/quantization/auto_awq.rst b/docs/source/quantization/auto_awq.rst index bbbb9aee78b3..e060dd29af0f 100644 --- a/docs/source/quantization/auto_awq.rst +++ b/docs/source/quantization/auto_awq.rst @@ -65,7 +65,7 @@ AWQ models are also supported directly through the LLM entrypoint: # Create an LLM. llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") - # Generate texts from the prompts. 
The output is a list of RequestOutput objects + # Generate texts from the prompts. The output is a list of CompletionRequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. diff --git a/docs/source/quantization/fp8_e5m2_kvcache.rst b/docs/source/quantization/fp8_e5m2_kvcache.rst index 337252a00aef..749caf623d31 100644 --- a/docs/source/quantization/fp8_e5m2_kvcache.rst +++ b/docs/source/quantization/fp8_e5m2_kvcache.rst @@ -22,7 +22,7 @@ Here is an example of how to enable this feature: sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") - # Generate texts from the prompts. The output is a list of RequestOutput objects + # Generate texts from the prompts. The output is a list of CompletionRequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index a81c4b3e399c..790c63ac2c3b 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -1,7 +1,7 @@ import argparse from typing import List, Tuple -from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm import CompletionRequestOutput, EngineArgs, LLMEngine, SamplingParams def create_test_prompts() -> List[Tuple[str, SamplingParams]]: @@ -34,7 +34,7 @@ def process_requests(engine: LLMEngine, engine.add_request(str(request_id), prompt, sampling_params) request_id += 1 - request_outputs: List[RequestOutput] = engine.step() + request_outputs: List[CompletionRequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index 6aa25b4689ec..476466d3f033 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -9,7 +9,7 @@ from huggingface_hub import snapshot_download -from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm import CompletionRequestOutput, EngineArgs, LLMEngine, SamplingParams from vllm.lora.request import LoRARequest @@ -87,7 +87,7 @@ def process_requests(engine: LLMEngine, lora_request=lora_request) request_id += 1 - request_outputs: List[RequestOutput] = engine.step() + request_outputs: List[CompletionRequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: diff --git a/examples/offline_inference.py b/examples/offline_inference.py index 9b758fa2479f..bac7640174b1 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -12,8 +12,9 @@ # Create an LLM. llm = LLM(model="facebook/opt-125m") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. +# Generate texts from the prompts. The output is a list of +# CompletionRequestOutput objects that contain the prompt, generated text, and +# other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. 
for output in outputs: diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py index e4f085fa6665..26bc8d683c5e 100644 --- a/examples/offline_inference_distributed.py +++ b/examples/offline_inference_distributed.py @@ -25,8 +25,8 @@ def __init__(self): def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: # Generate texts from the prompts. - # The output is a list of RequestOutput objects that contain the prompt, - # generated text, and other information. + # The output is a list of CompletionRequestOutput objects that contain + # the prompt, generated text, and other information. outputs = self.llm.generate(batch["text"], sampling_params) prompt = [] generated_text = [] diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py index 5ecbbf020ab8..539634791b37 100755 --- a/examples/offline_inference_neuron.py +++ b/examples/offline_inference_neuron.py @@ -26,8 +26,9 @@ # or explicitly assigned. device="neuron", tensor_parallel_size=2) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. +# Generate texts from the prompts. The output is a list of +# CompletionRequestOutput objects that contain the prompt, generated text, and +# other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. for output in outputs: diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 7ed0563f14e0..d8bc2a5ea6a5 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -26,8 +26,9 @@ generating_prompts = [prefix + prompt for prompt in prompts] -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. +# Generate texts from the prompts. The output is a list of +# CompletionRequestOutput objects that contain the prompt, generated text, and +# other information. outputs = llm.generate(generating_prompts, sampling_params) # Print the outputs. 
for output in outputs: diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 7b1f4a9e1eb2..f9816540a0b4 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -1,7 +1,7 @@ import pytest from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import RequestOutput +from vllm.outputs import CompletionRequestOutput @pytest.mark.asyncio @@ -55,7 +55,7 @@ async def test_request_tracker(): stream_5 = tracker.add_request("5") assert tracker.new_requests_event.is_set() tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) + CompletionRequestOutput("2", "output", [], [], [], finished=True)) await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() assert not tracker.new_requests_event.is_set() diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index f288652d5155..d52b22c30bd4 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -7,8 +7,8 @@ from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.utils import set_random_seed from vllm.sampling_params import SamplingParams -from vllm.sequence import (Logprob, SamplerOutput, SequenceData, - SequenceGroupMetadata, SequenceGroupOutput, +from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, + SamplerOutput, SequenceData, SequenceGroupMetadata, SequenceOutput) from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine @@ -170,7 +170,7 @@ def create_sampler_output_list( return [ SamplerOutput(outputs=[ - SequenceGroupOutput( + CompletionSequenceGroupOutput( samples=[ SequenceOutput( output_token=token_id, diff --git a/tests/test_sequence.py b/tests/test_sequence.py index b16bdc141e57..3f691f95a1f5 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -5,8 +5,9 @@ from vllm import SamplingParams from vllm.lora.request import LoRARequest -from vllm.sequence import (SamplerOutput, Sequence, SequenceData, - SequenceGroup, SequenceGroupOutput, SequenceOutput) +from vllm.sequence import (CompletionSequenceGroupOutput, SamplerOutput, + Sequence, SequenceData, SequenceGroup, + SequenceOutput) def create_dummy_prompt( @@ -36,10 +37,10 @@ def create_dummy_prompt( @pytest.fixture def sample_outputs(): return [ - SequenceGroupOutput(samples=[ + CompletionSequenceGroupOutput(samples=[ SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) ], - prompt_logprobs=None) for i in range(5) + prompt_logprobs=None) for i in range(5) ] @@ -60,10 +61,10 @@ def test_sampler_output_getitem(sampler_output, sample_outputs): def test_sampler_output_setitem(sampler_output): - new_output = SequenceGroupOutput(samples=[ + new_output = CompletionSequenceGroupOutput(samples=[ SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) ], - prompt_logprobs=None) + prompt_logprobs=None) sampler_output[2] = new_output assert sampler_output[2] == new_output diff --git a/vllm/__init__.py b/vllm/__init__.py index 59810da3ca41..01e62ebbbe51 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -6,7 +6,8 @@ from vllm.entrypoints.llm import LLM from vllm.executor.ray_utils import initialize_ray_cluster from vllm.model_executor.models import ModelRegistry -from vllm.outputs import CompletionOutput, RequestOutput +from vllm.outputs import (CompletionOutput, CompletionRequestOutput, + EmbeddingRequestOutput, RequestOutput) from vllm.sampling_params import 
SamplingParams __version__ = "0.4.2" @@ -17,6 +18,8 @@ "SamplingParams", "RequestOutput", "CompletionOutput", + "CompletionRequestOutput", + "EmbeddingRequestOutput", "LLMEngine", "EngineArgs", "AsyncLLMEngine", diff --git a/vllm/config.py b/vllm/config.py index a2cb9b32c65f..8ed6b2f916a5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -73,6 +73,8 @@ class ModelConfig: matches the model name exposed via the APIs. If multiple model names provided, the first name will be used. If not specified, the model name will be the same as `model`. + embedding_mode: Whether the running model is for embedding. It should + be used for embedding models. """ def __init__( @@ -95,6 +97,7 @@ def __init__( max_logprobs: int = 5, skip_tokenizer_init: bool = False, served_model_name: Optional[Union[str, List[str]]] = None, + embedding_mode: Optional[bool] = False, ) -> None: self.model = model self.tokenizer = tokenizer @@ -115,6 +118,7 @@ def __init__( or max_context_len_to_capture) self.max_logprobs = max_logprobs self.skip_tokenizer_init = skip_tokenizer_init + self.embedding_mode = embedding_mode self.hf_config = get_config(self.model, trust_remote_code, revision, code_revision) @@ -591,6 +595,7 @@ class SchedulerConfig: prompt latency) before scheduling next prompt. enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. + embedding_mode: Whether the running model is for embedding. """ def __init__( @@ -602,6 +607,7 @@ def __init__( num_lookahead_slots: int = 0, delay_factor: float = 0.0, enable_chunked_prefill: bool = False, + embedding_mode: Optional[bool] = False, ) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens @@ -610,6 +616,9 @@ def __init__( # It is the values that have the best balance between ITL # and TTFT on A100. Note it is not optimized for throughput. self.max_num_batched_tokens = 512 + elif embedding_mode: + # For embedding, choose 32768 for higher throughput + self.max_num_batched_tokens = max(max_model_len, 32768) else: # If max_model_len is too short, use 2048 as the default value # for higher throughput. @@ -623,6 +632,7 @@ def __init__( self.num_lookahead_slots = num_lookahead_slots self.delay_factor = delay_factor self.chunked_prefill_enabled = enable_chunked_prefill + self.embedding_mode = embedding_mode self._verify_args() diff --git a/vllm/core/block_manager_v3.py b/vllm/core/block_manager_v3.py new file mode 100644 index 000000000000..0e6fb807bd4d --- /dev/null +++ b/vllm/core/block_manager_v3.py @@ -0,0 +1,84 @@ +from typing import Dict, List + +from vllm.core.interfaces import AllocStatus, BlockSpaceManager +from vllm.sequence import Sequence, SequenceGroup + + +class BlockSpaceManagerV3(BlockSpaceManager): + """A simple version of BlockSpaceManager for use in environments + where block management is not required. + + This class provides the same interface as BlockSpaceManager, but its + methods perform no actions or return simple values like True in specific + actions. It's designed to be used in scenarios where the overhead of + block management is unnecessary, such as in an embedding environment. 
+ """ + + def __init__( + self, + **kwargs, + ) -> None: + pass + + def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: + # Always return OK for dummy purposes + return AllocStatus.OK + + def allocate(self, seq_group: SequenceGroup) -> None: + # No actual allocation logic needed + pass + + def can_append_slots(self, seq_group: SequenceGroup, + num_lookahead_slots: int) -> bool: + pass + + def append_slots( + self, + seq: Sequence, + num_lookahead_slots: int, + ) -> Dict[int, List[int]]: + pass + + def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: + pass + + def can_swap_in(self, seq_group: SequenceGroup, + num_lookahead_slots: int) -> bool: + return True + + def swap_in(self, seq_group: SequenceGroup, + num_lookahead_slots: int) -> Dict[int, int]: + pass + + def can_swap_out(self, seq_group: SequenceGroup) -> bool: + return True + + def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: + pass + + def free(self, seq: Sequence) -> None: + # No operation on free + pass + + def get_block_table(self, seq: Sequence) -> List[int]: + return [] + + def get_num_free_gpu_blocks(self) -> int: + return 1 + + def get_num_free_cpu_blocks(self) -> int: + return 1 + + def access_all_blocks_in_seq( + self, + seq: Sequence, + access_time: float, + ) -> None: + pass + + def get_common_computed_block_ids(self, + seq_group: SequenceGroup) -> List[int]: + return [] + + def mark_blocks_as_computed(self, seq_group: SequenceGroup): + pass diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index b2a5e41990f3..2e16f1dee350 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -35,6 +35,10 @@ def get_block_space_manager_class(version: str): from vllm.core.block_manager_v2 import BlockSpaceManagerV2 return BlockSpaceManagerV2 + if version == "v3": + from vllm.core.block_manager_v3 import BlockSpaceManagerV3 + return BlockSpaceManagerV3 + raise ValueError(f"Unknown version {version=}") @abstractmethod diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 35e3db18f1c4..7319e05bb610 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -270,9 +270,14 @@ def __init__( self.scheduler_config.max_model_len, self.scheduler_config.max_num_batched_tokens) + version = "v1" + if self.scheduler_config.use_v2_block_manager: + version = "v2" + if self.scheduler_config.embedding_mode: + version = "v3" + BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class( - version="v2" if self.scheduler_config. - use_v2_block_manager else "v1") + version) # Create the block space manager. self.block_manager = BlockSpaceManagerImpl( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5c2acbef1312..32d3d3cc7be5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -77,6 +77,7 @@ class EngineArgs: image_feature_size: Optional[int] = None scheduler_delay_factor: float = 0.0 enable_chunked_prefill: bool = False + embedding_mode: bool = False guided_decoding_backend: str = 'outlines' # Speculative decoding configuration. @@ -445,14 +446,12 @@ def add_cli_args( action='store_true', help='If set, the prefill requests can be chunked based on the ' 'max_num_batched_tokens.') - parser.add_argument( '--speculative-model', type=nullable_str, default=EngineArgs.speculative_model, help= 'The name of the draft model to be used in speculative decoding.') - parser.add_argument( '--num-speculative-tokens', type=int, @@ -497,7 +496,6 @@ def add_cli_args( 'corresponding to the chosen load_format. 
' 'This should be a JSON string that will be ' 'parsed into a dictionary.') - parser.add_argument( "--served-model-name", nargs="+", @@ -512,7 +510,6 @@ def add_cli_args( "will also be used in `model_name` tag content of " "prometheus metrics, if multiple names provided, metrics" "tag will take the first one.") - return parser @classmethod @@ -532,7 +529,8 @@ def create_engine_config(self, ) -> EngineConfig: self.quantization, self.quantization_param_path, self.enforce_eager, self.max_context_len_to_capture, self.max_seq_len_to_capture, self.max_logprobs, - self.skip_tokenizer_init, self.served_model_name) + self.skip_tokenizer_init, self.served_model_name, + self.embedding_mode) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, @@ -574,6 +572,7 @@ def create_engine_config(self, ) -> EngineConfig: speculative_config.num_lookahead_slots), delay_factor=self.scheduler_delay_factor, enable_chunked_prefill=self.enable_chunked_prefill, + embedding_mode=model_config.embedding_mode, ) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 37a2dc77a3b5..86925972b9ab 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -14,7 +14,7 @@ from vllm.executor.ray_utils import initialize_ray_cluster, ray from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput +from vllm.outputs import CompletionRequestOutput, EmbeddingRequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, MultiModalData, SamplerOutput from vllm.usage.usage_lib import UsageContext @@ -47,15 +47,18 @@ def _raise_exception_on_finish( class AsyncStream: - """A stream of RequestOutputs for a request that can be - iterated over asynchronously.""" + """A stream of RequestOutputs or EmbeddingRequestOutputs for a request + that can be iterated over asynchronously.""" def __init__(self, request_id: str) -> None: self.request_id = request_id self._queue: asyncio.Queue = asyncio.Queue() self._finished = False - def put(self, item: Union[RequestOutput, Exception]) -> None: + def put( + self, item: Union[CompletionRequestOutput, EmbeddingRequestOutput, + Exception] + ) -> None: if self._finished: return self._queue.put_nowait(item) @@ -71,7 +74,8 @@ def finished(self) -> bool: def __aiter__(self): return self - async def __anext__(self) -> RequestOutput: + async def __anext__( + self) -> Union[CompletionRequestOutput, EmbeddingRequestOutput]: result = await self._queue.get() if isinstance(result, Exception): raise result @@ -108,7 +112,8 @@ def propagate_exception(self, self.abort_request(rid) def process_request_output(self, - request_output: RequestOutput, + request_output: Union[CompletionRequestOutput, + EmbeddingRequestOutput], *, verbose: bool = False) -> None: """Process a request output from the engine.""" @@ -196,7 +201,9 @@ def has_new_requests(self): class _AsyncLLMEngine(LLMEngine): """Extension of LLMEngine to add async methods.""" - async def step_async(self) -> List[RequestOutput]: + async def step_async( + self + ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. The workers are ran asynchronously if possible. 
@@ -579,7 +586,7 @@ async def generate( prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None - ) -> AsyncIterator[RequestOutput]: + ) -> AsyncIterator[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: """Generate outputs for a request. Generate outputs for a request. This method is a coroutine. It adds the @@ -597,8 +604,8 @@ async def generate( multi_modal_data: Multi modal data per request. Yields: - The output `RequestOutput` objects from the LLMEngine for the - request. + The output `RequestOutput` or `EmbeddingRequestOutput` objects + from the LLMEngine for the request. Details: - If the engine is not running, start the background loop, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b9938b045ba2..e982a1e3211a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -20,11 +20,14 @@ from vllm.executor.ray_utils import initialize_ray_cluster from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput +from vllm.outputs import (CompletionRequestOutput, EmbeddingRequestOutput, + RequestOutputFactory) from vllm.sampling_params import SamplingParams -from vllm.sequence import (ExecuteModelRequest, MultiModalData, SamplerOutput, - Sequence, SequenceGroup, SequenceGroupMetadata, - SequenceStatus) +from vllm.sequence import (CompletionSequenceGroupOutput, + EmbeddingSequenceGroupOutput, ExecuteModelRequest, + MultiModalData, SamplerOutput, Sequence, + SequenceGroup, SequenceStatus, + SequenceGroupMetadata) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -169,7 +172,8 @@ def __init__( load_config=load_config, ) - self._initialize_kv_caches() + if not self.model_config.embedding_mode: + self._initialize_kv_caches() # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): @@ -484,13 +488,27 @@ def has_unfinished_requests(self) -> bool: """Returns True if there are unfinished requests.""" return self.scheduler.has_unfinished_seqs() + def _process_sequence_group_outputs( + self, seq_group: SequenceGroup, + outputs: Union[CompletionSequenceGroupOutput, + EmbeddingSequenceGroupOutput] + ) -> None: + + if self.model_config.embedding_mode: + seq_group.embeddings = outputs.embeddings + + for seq in seq_group.get_seqs(): + seq.status = SequenceStatus.FINISHED_STOPPED + + return + def _process_model_outputs( self, output: List[SamplerOutput], scheduled_seq_groups: List[ScheduledSequenceGroup], ignored_seq_groups: List[SequenceGroup], seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> List[RequestOutput]: + ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: """Apply the model output to the sequences in the scheduled seq groups. Returns RequestOutputs that can be returned to the client. @@ -510,6 +528,7 @@ def _process_model_outputs( seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) + self._process_sequence_group_outputs(seq_group, outputs) self.output_processor.process_prompt_logprob(seq_group, outputs) if seq_group_meta.do_sample: @@ -519,18 +538,21 @@ def _process_model_outputs( self.scheduler.free_finished_seq_groups() # Create the outputs. 
- request_outputs: List[RequestOutput] = [] + request_outputs: List[Union[CompletionRequestOutput, + EmbeddingRequestOutput]] = [] for scheduled_seq_group in scheduled_seq_groups: seq_group = scheduled_seq_group.seq_group seq_group.maybe_set_first_token_time(now) - request_output = RequestOutput.from_seq_group(seq_group) + request_output = RequestOutputFactory.create(seq_group) request_outputs.append(request_output) for seq_group in ignored_seq_groups: - request_output = RequestOutput.from_seq_group(seq_group) + request_output = RequestOutputFactory.create(seq_group) request_outputs.append(request_output) return request_outputs - def step(self) -> List[RequestOutput]: + def step( + self + ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. .. figure:: https://i.imgur.com/sv2HssD.png @@ -637,12 +659,15 @@ def _get_stats( # KV Cache Usage in % num_total_gpu = self.cache_config.num_gpu_blocks - num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks() - gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu) + gpu_cache_usage_sys = 0. + if num_total_gpu is not None: + num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks( + ) + gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu) num_total_cpu = self.cache_config.num_cpu_blocks cpu_cache_usage_sys = 0. - if num_total_cpu > 0: + if num_total_cpu is not None and num_total_cpu > 0: num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks( ) cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 71620139fba3..f65584479592 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -7,7 +7,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput +from vllm.outputs import CompletionRequestOutput, EmbeddingRequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import MultiModalData from vllm.usage.usage_lib import UsageContext @@ -75,6 +75,8 @@ class LLM: When a sequence has context length larger than this, we fall back to eager mode. disable_custom_all_reduce: See ParallelConfig + embedding_mode: Whether the running model is for embedding. It should + be used for embedding models. """ def __init__( @@ -96,6 +98,7 @@ def __init__( max_context_len_to_capture: Optional[int] = None, max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, + embedding_mode: bool = False, **kwargs, ) -> None: if "disable_log_stats" not in kwargs: @@ -118,6 +121,7 @@ def __init__( max_context_len_to_capture=max_context_len_to_capture, max_seq_len_to_capture=max_seq_len_to_capture, disable_custom_all_reduce=disable_custom_all_reduce, + embedding_mode=embedding_mode, **kwargs, ) self.llm_engine = LLMEngine.from_engine_args( @@ -143,7 +147,7 @@ def generate( use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, - ) -> List[RequestOutput]: + ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: """Generates the completions for the input prompts. 
NOTE: This class automatically batches the given prompts, considering @@ -234,7 +238,9 @@ def _add_request( lora_request=lora_request, multi_modal_data=multi_modal_data) - def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: + def _run_engine( + self, use_tqdm: bool + ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() @@ -245,7 +251,8 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: postfix=f"Generation Speed: {0:.2f} toks/s", ) # Run the engine. - outputs: List[RequestOutput] = [] + outputs: List[Union[CompletionRequestOutput, + EmbeddingRequestOutput]] = [] total_toks = 0 while self.llm_engine.has_unfinished_requests(): step_outputs = self.llm_engine.step() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 362f28d05c3b..1ec7c96f848c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -22,9 +22,11 @@ from vllm.entrypoints.openai.cli_args import make_arg_parser from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, - CompletionRequest, ErrorResponse) + CompletionRequest, + EmbeddingRequest, ErrorResponse) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext @@ -32,6 +34,8 @@ openai_serving_chat: OpenAIServingChat openai_serving_completion: OpenAIServingCompletion +openai_serving_embedding: OpenAIServingEmbedding + logger = init_logger(__name__) _running_tasks: Set[asyncio.Task] = set() @@ -123,6 +127,17 @@ async def create_completion(request: CompletionRequest, raw_request: Request): return JSONResponse(content=generator.model_dump()) +@app.post("/v1/embeddings") +async def create_embedding(request: EmbeddingRequest, raw_request: Request): + generator = await openai_serving_embedding.create_embedding( + request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + else: + return JSONResponse(content=generator.model_dump()) + + if __name__ == "__main__": args = parse_args() @@ -190,7 +205,8 @@ async def authentication(request: Request, call_next): args.chat_template) openai_serving_completion = OpenAIServingCompletion( engine, model_config, served_model_names, args.lora_modules) - + openai_serving_embedding = OpenAIServingEmbedding(engine,model_config, + served_model_names) app.root_path = args.root_path uvicorn.run(app, host=args.host, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3cd9ddad3b7b..8d83d6c652de 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -363,6 +363,14 @@ def check_guided_decoding_count(cls, data): return data +class EmbeddingRequest(BaseModel): + model: str + input: Union[List[int], List[List[int]], str, List[str]] + encoding_format: Optional[str] = Field('float', pattern='^(float|base64)$') + dimensions: Optional[int] = None + user: Optional[str] = None + + class LogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) @@ -416,6 +424,21 @@ class CompletionStreamResponse(OpenAIBaseModel): usage: 
Optional[UsageInfo] = Field(default=None) +class EmbeddingResponseData(BaseModel): + index: int + object: str = "embedding" + embedding: List[float] + + +class EmbeddingResponse(BaseModel): + id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") + object: str = "list" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + data: List[EmbeddingResponseData] + usage: UsageInfo + + class ChatMessage(OpenAIBaseModel): role: str content: str diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 1b469fc59b07..18bd4e2dfbc1 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -19,7 +19,7 @@ from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) -from vllm.outputs import RequestOutput +from vllm.outputs import CompletionRequestOutput from vllm.utils import random_uuid logger = init_logger(__name__) @@ -180,8 +180,8 @@ def get_chat_request_role(self, request: ChatCompletionRequest) -> str: async def chat_completion_stream_generator( self, request: ChatCompletionRequest, - result_generator: AsyncIterator[RequestOutput], request_id: str, - conversation: List[ConversationMessage] + result_generator: AsyncIterator[CompletionRequestOutput], + request_id: str, conversation: List[ConversationMessage] ) -> AsyncGenerator[str, None]: model_name = self.served_model_names[0] created_time = int(time.time()) @@ -320,13 +320,13 @@ async def chat_completion_stream_generator( async def chat_completion_full_generator( self, request: ChatCompletionRequest, raw_request: Request, - result_generator: AsyncIterator[RequestOutput], request_id: str, - conversation: List[ConversationMessage] + result_generator: AsyncIterator[CompletionRequestOutput], + request_id: str, conversation: List[ConversationMessage] ) -> Union[ErrorResponse, ChatCompletionResponse]: model_name = self.served_model_names[0] created_time = int(time.time()) - final_res: Optional[RequestOutput] = None + final_res: Optional[CompletionRequestOutput] = None async for res in result_generator: if await raw_request.is_disconnected(): diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 158d8ed7fbbf..e5b11201bc4c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -17,7 +17,7 @@ from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) -from vllm.outputs import RequestOutput +from vllm.outputs import CompletionRequestOutput from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -86,7 +86,7 @@ async def create_completion(self, request: CompletionRequest, created_time = int(time.time()) # Schedule the request and get the result generator. - generators: List[AsyncIterator[RequestOutput]] = [] + generators: List[AsyncIterator[CompletionRequestOutput]] = [] try: sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) @@ -130,7 +130,7 @@ async def create_completion(self, request: CompletionRequest, return self.create_error_response(str(e)) result_generator: AsyncIterator[Tuple[ - int, RequestOutput]] = merge_async_iterators(*generators) + int, CompletionRequestOutput]] = merge_async_iterators(*generators) # Similar to the OpenAI API, when n != best_of, we do not stream the # results. 
In addition, we do not stream the results when use @@ -150,7 +150,8 @@ async def create_completion(self, request: CompletionRequest, num_prompts=len(prompts)) # Non-streaming response - final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts) + final_res_batch: List[ + Optional[CompletionRequestOutput]] = [None] * len(prompts) try: async for i, res in result_generator: if await raw_request.is_disconnected(): @@ -181,7 +182,7 @@ async def completion_stream_generator( self, request: CompletionRequest, raw_request: Request, - result_generator: AsyncIterator[Tuple[int, RequestOutput]], + result_generator: AsyncIterator[Tuple[int, CompletionRequestOutput]], request_id: str, created_time: int, model_name: str, @@ -277,7 +278,7 @@ async def completion_stream_generator( def request_output_to_completion_response( self, - final_res_batch: List[RequestOutput], + final_res_batch: List[CompletionRequestOutput], request: CompletionRequest, request_id: str, created_time: int, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py new file mode 100644 index 000000000000..976965ea9bc0 --- /dev/null +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -0,0 +1,157 @@ +import asyncio +import time +from typing import AsyncIterator, List, Tuple + +from fastapi import Request + +from vllm.config import ModelConfig +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.protocol import (EmbeddingRequest, + EmbeddingResponse, + EmbeddingResponseData, UsageInfo) +from vllm.entrypoints.openai.serving_completion import parse_prompt_format +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.logger import init_logger +from vllm.outputs import EmbeddingRequestOutput +from vllm.sampling_params import SamplingParams +from vllm.utils import random_uuid + +logger = init_logger(__name__) + +TypeTokenIDs = List[int] + + +def request_output_to_embedding_response( + final_res_batch: List[EmbeddingRequestOutput], + request_id: str, + created_time: int, + model_name: str, +) -> EmbeddingResponse: + data = [] + num_prompt_tokens = 0 + for idx, final_res in enumerate(final_res_batch): + assert final_res is not None + prompt_token_ids = final_res.prompt_token_ids + + embedding_data = EmeddingResponseData( + index=idx, embedding=final_res.outputs.embedding) + data.append(embedding_data) + + num_prompt_tokens += len(prompt_token_ids) + + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + total_tokens=num_prompt_tokens, + ) + + return EmbeddingResponse( + id=request_id, + created=created_time, + model=model_name, + data=data, + usage=usage, + ) + + +def merge_async_iterators(*iterators): + """Merge multiple asynchronous iterators into a single iterator. + + This method handle the case where some iterators finish before others. + When it yields, it yields a tuple (i, item) where i is the index of the + iterator that yields the item. 
+ """ + queue = asyncio.Queue() + + finished = [False] * len(iterators) + + async def producer(i, iterator): + async for item in iterator: + await queue.put((i, item)) + finished[i] = True + + _tasks = [ + asyncio.create_task(producer(i, iterator)) + for i, iterator in enumerate(iterators) + ] + + async def consumer(): + while not all(finished) or not queue.empty(): + item = await queue.get() + yield item + await asyncio.gather(*_tasks) + + return consumer() + + +class OpenAIServingEmbedding(OpenAIServing): + + def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig, + served_model_names: List[str]): + super().__init__(engine=engine, + model_config=model_config, + served_model_names=served_model_names, + lora_modules=None) + + async def create_embedding(self, request: EmbeddingRequest, + raw_request: Request): + """Completion API similar to OpenAI's API. + + See https://platform.openai.com/docs/api-reference/embeddings/create + for the API specification. This API mimics the OpenAI Embedding API. + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + # Return error for unsupported features. + if request.encoding_format == "base64": + return self.create_error_response( + "base64 encoding is not currently supported") + if request.dimensions is not None: + return self.create_error_response( + "dimensions is currently not supported") + + model_name = request.model + request_id = f"cmpl-{random_uuid()}" + created_time = int(time.monotonic()) + + # Schedule the request and get the result generator. + generators = [] + try: + prompt_is_tokens, prompts = parse_prompt_format(request.input) + + for i, prompt in enumerate(prompts): + if prompt_is_tokens: + prompt_formats = self._validate_prompt_and_tokenize( + request, prompt_ids=prompt) + else: + prompt_formats = self._validate_prompt_and_tokenize( + request, prompt=prompt) + + prompt_ids, prompt_text = prompt_formats + + generators.append( + self.engine.generate(prompt_text, + SamplingParams(), + f"{request_id}-{i}", + prompt_token_ids=prompt_ids)) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + result_generator: AsyncIterator[Tuple[ + int, EmbeddingRequestOutput]] = merge_async_iterators(*generators) + + # Non-streaming response + final_res_batch: EmbeddingRequestOutput = [None] * len(prompts) + async for i, res in result_generator: + if await raw_request.is_disconnected(): + # Abort the request if the client disconnects. 
+ await self.engine.abort(f"{request_id}-{i}") + # TODO: Use a vllm-specific Validation Error + return self.create_error_response("Client disconnected") + final_res_batch[i] = res + response = request_output_to_embedding_response( + final_res_batch, request_id, created_time, model_name) + + return response diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index f10718c5f3d8..58a1c2f7e73f 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -9,7 +9,8 @@ from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - CompletionRequest, ErrorResponse, + CompletionRequest, + EmbeddingRequest, ErrorResponse, LogProbs, ModelCard, ModelList, ModelPermission) from vllm.logger import init_logger @@ -165,7 +166,8 @@ def _maybe_get_lora( def _validate_prompt_and_tokenize( self, - request: Union[ChatCompletionRequest, CompletionRequest], + request: Union[ChatCompletionRequest, CompletionRequest, + EmbeddingRequest], prompt: Optional[str] = None, prompt_ids: Optional[List[int]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None @@ -191,6 +193,16 @@ def _validate_prompt_and_tokenize( prompt_ids) token_num = len(input_ids) + # Note: EmbeddingRequest doesn't have max_tokens + if isinstance(request, EmbeddingRequest): + if token_num > self.max_model_len: + raise ValueError( + f"This model's maximum context length is " + f"{self.max_model_len} tokens. However, you requested " + f"{token_num} tokens in the input for embedding " + f"generation. Please reduce the length of the input.", ) + return input_ids, input_text + if request.max_tokens is None: if token_num >= self.max_model_len: raise ValueError( diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index e52e350d2726..c8bab46c83ec 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -10,8 +10,9 @@ SamplingTensors, SequenceGroupToSample) from vllm.sampling_params import SamplingType -from vllm.sequence import (Logprob, PromptLogprobs, SampleLogprobs, - SamplerOutput, SequenceGroupOutput, SequenceOutput) +from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, + PromptLogprobs, SampleLogprobs, SamplerOutput, + SequenceOutput) # (num_token_ids, num_parent_ids) per sequence group. SampleResultType = List[Tuple[List[int], List[int]]] @@ -1019,7 +1020,7 @@ def _build_sampler_output( seq_outputs.append( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( - SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) + CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs)) # If not specified, store None values in SamplerOutput. if on_device_tensors is not None: diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index c5cdc059473b..82344c4d8e7c 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -9,7 +9,7 @@ logger = init_logger(__name__) # Architecture -> (module, class). 
-_MODELS = { +_GENERATION_MODELS = { "AquilaModel": ("llama", "LlamaForCausalLM"), "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b @@ -57,6 +57,12 @@ "XverseForCausalLM": ("xverse", "XverseForCausalLM"), } +_EMBEDDING_MODELS = { + "MistralModel": ("llama_embedding", "LlamaEmbeddingModel"), +} + +_MODELS = {**_GENERATION_MODELS, **_EMBEDDING_MODELS} + # Architecture -> type. # out of tree models _OOT_MODELS: Dict[str, Type[nn.Module]] = {} diff --git a/vllm/model_executor/models/llama_embedding.py b/vllm/model_executor/models/llama_embedding.py new file mode 100644 index 000000000000..0b2a8791f087 --- /dev/null +++ b/vllm/model_executor/models/llama_embedding.py @@ -0,0 +1,327 @@ +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import LlamaConfig + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import LoRAConfig +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import EmbeddingSequenceGroupOutput, SamplerOutput + + +class LlamaMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class LlamaAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + bias: bool = False, + sliding_window: Optional[int] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=bias, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=bias, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class LlamaDecoderLayer(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + sliding_window = getattr(config, "sliding_window", None) + self.self_attn = LlamaAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + bias=getattr(config, "bias", False), + sliding_window=sliding_window, + ) + self.mlp = LlamaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class LlamaEmbeddingModel(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method: Optional[LinearMethodBase] = None, + 
lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + self.layers = nn.ModuleList([ + LlamaDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + attn_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def _last_token_pool(self, last_hidden_states: torch.Tensor, + prompt_lens_tensor: torch.Tensor) -> torch.Tensor: + # Calculate cumulative lengths to get the start index of each sequence + # in the flattened tensor + cum_lengths = torch.cumsum(prompt_lens_tensor, dim=0) + + # Calculate the flat indices for the last token of each sequence + last_token_flat_indices = cum_lengths - 1 + + # Ensure last_token_flat_indices is compatible for direct indexing + # Select the last tokens based on the calculated flat indices + last_tokens = last_hidden_states[last_token_flat_indices] + + return last_tokens + + def embedding( + self, + attn_metadata: AttentionMetadata, + hidden_states: torch.Tensor, + ) -> Optional[SamplerOutput]: + outputs = self._last_token_pool(hidden_states, + attn_metadata.prompt_lens_tensor) + + seq_outputs = [] + for output in outputs: + seq_outputs.append( + EmbeddingSequenceGroupOutput(embeddings=output.tolist())) + return SamplerOutput(outputs=seq_outputs) + + def load_weights( + self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + ): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/outputs.py b/vllm/outputs.py index d01be0eb0efd..c448c7eac773 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,4 +1,5 @@ import time +from abc import ABC, abstractmethod from typing import List, Optional, Union from vllm.lora.request import LoRARequest @@ -57,8 +58,52 @@ def __repr__(self) -> str: f"stop_reason={self.stop_reason})") -class RequestOutput: - """The output data of a request to the LLM. +class EmbeddingOutput: + """The output data of one completion output of a request. + + Args: + embedding: The embedding vector, which is a list of floats. The + length of vector depends on the model as listed in the embedding guide. + """ + + def __init__( + self, + embedding: List[float], + ) -> None: + self.embedding = embedding + + def __repr__(self) -> str: + return (f"EmbeddingOutput(" + f"embedding={len(self.embedding)}") + + +class RequestOutput(ABC): + """ + An abstract base class representing the output of a request to the LLM. + The request could be a completion request or an embedding request. + """ + + def __init__(self, request_id: str, prompt_token_ids: List[int], + finished: bool): + self.request_id = request_id + self.prompt_token_ids = prompt_token_ids + self.finished = finished + + @abstractmethod + def from_seq_group(cls, seq_group: 'SequenceGroup') -> "RequestOutput": + """ + A class method to initialize a RequestOutput (or its subclasses) + instance from a SequenceGroup. + """ + pass + + @abstractmethod + def __repr__(self) -> str: + pass + + +class CompletionRequestOutput(RequestOutput): + """The output data of a completion request to the LLM. Args: request_id: The unique ID of the request. @@ -82,17 +127,16 @@ def __init__( metrics: Optional[RequestMetrics] = None, lora_request: Optional[LoRARequest] = None, ) -> None: - self.request_id = request_id + super().__init__(request_id, prompt_token_ids, finished) self.prompt = prompt - self.prompt_token_ids = prompt_token_ids self.prompt_logprobs = prompt_logprobs self.outputs = outputs - self.finished = finished self.metrics = metrics self.lora_request = lora_request @classmethod - def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": + def from_seq_group(cls, + seq_group: SequenceGroup) -> "CompletionRequestOutput": seqs = seq_group.get_seqs() if len(seqs) == 1: top_n_seqs = seqs @@ -140,7 +184,7 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": lora_request=seq_group.lora_request) def __repr__(self) -> str: - return (f"RequestOutput(request_id={self.request_id}, " + return (f"CompletionRequestOutput(request_id={self.request_id}, " f"prompt={self.prompt!r}, " f"prompt_token_ids={self.prompt_token_ids}, " f"prompt_logprobs={self.prompt_logprobs}, " @@ -148,3 +192,56 @@ def __repr__(self) -> str: f"finished={self.finished}, " f"metrics={self.metrics}, " f"lora_request={self.lora_request})") + + +class EmbeddingRequestOutput(RequestOutput): + """ + The output data of an embedding request to the LLM. + + Args: + request_id (str): A unique identifier for the embedding request. 
+ outputs (EmbeddingOutput): The embedding results for the given input. + prompt_token_ids (List[int]): A list of token IDs used in the prompt. + finished (bool): A flag indicating whether the embedding is completed. + """ + + def __init__(self, request_id: str, outputs: 'EmbeddingOutput', + prompt_token_ids: List[int], finished: bool): + super().__init__(request_id, prompt_token_ids, finished) + self.outputs = outputs + + @classmethod + def from_seq_group(cls, + seq_group: 'SequenceGroup') -> "EmbeddingRequestOutput": + output = EmbeddingOutput(seq_group.embeddings) + prompt_token_ids = seq_group.prompt_token_ids + finished = seq_group.is_finished() + + return cls(seq_group.request_id, output, prompt_token_ids, finished) + + def __repr__(self): + """ + Returns a string representation of an EmbeddingRequestOutput instance. + + The representation includes the request_id and the number of outputs, + providing a quick overview of the embedding request's results. + + Returns: + str: A string representation of the EmbeddingRequestOutput instance. + """ + return (f"EmbeddingRequestOutput(request_id='{self.request_id}', " + f"outputs={repr(self.outputs)}, " + f"prompt_token_ids={self.prompt_token_ids}, " + f"finished={self.finished})") + + +class RequestOutputFactory: + + @staticmethod + def create(seq_group): + # Determine the type based on a condition, for example: + if hasattr(seq_group, + 'embeddings') and seq_group.embeddings is not None: + return EmbeddingRequestOutput.from_seq_group(seq_group) + else: + return CompletionRequestOutput.from_seq_group(seq_group) diff --git a/vllm/sequence.py b/vllm/sequence.py index 3cebb85b49d2..9d16a847cc48 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1,6 +1,7 @@ """Sequence and its related classes.""" import copy import enum +from abc import ABC, abstractmethod from dataclasses import dataclass, field from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union @@ -375,12 +376,12 @@ class SequenceGroupState: class MultiModalData: """Multi modal request. - + Args: type: The data type. data: The actual data. The required shape and semantic meaning of it depends on the vision - language config of the hosted model. + language config of the hosted model. See `VisionLanguageConfig` in `config.py`. """ @@ -402,6 +403,7 @@ class SequenceGroup: arrival_time: The arrival time of the request. lora_request: LoRA request. multi_modal_data: Multi modal data associated with the request. + embeddings: The embeddings vectors of the prompt of the sequence group. 
""" def __init__( @@ -412,6 +414,7 @@ def __init__( arrival_time: float, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, + embeddings: Optional[List[float]] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -425,6 +428,7 @@ def __init__( self.prompt_logprobs: Optional[PromptLogprobs] = None self.state = SequenceGroupState() self.multi_modal_data = multi_modal_data + self.embeddings = embeddings @property def prompt(self) -> str: @@ -669,8 +673,20 @@ def __eq__(self, other: object) -> bool: return equal and log_probs_equal -class SequenceGroupOutput: - """The model output associated with a sequence group.""" +class SequenceGroupOutput(ABC): + """The base class for model outputs associated with a sequence group.""" + + @abstractmethod + def __repr__(self) -> str: + pass + + @abstractmethod + def __eq__(self, other: object) -> bool: + pass + + +class CompletionSequenceGroupOutput(SequenceGroupOutput): + """The model output associated with a completion sequence group.""" def __init__( self, @@ -682,26 +698,46 @@ def __init__( self.prompt_logprobs = prompt_logprobs def __repr__(self) -> str: - return (f"SequenceGroupOutput(samples={self.samples}, " + return (f"CompletionSequenceGroupOutput(samples={self.samples}, " f"prompt_logprobs={self.prompt_logprobs})") def __eq__(self, other: object) -> bool: - if not isinstance(other, SequenceGroupOutput): + if not isinstance(other, CompletionSequenceGroupOutput): raise NotImplementedError() return (self.samples == other.samples and self.prompt_logprobs == other.prompt_logprobs) +class EmbeddingSequenceGroupOutput(SequenceGroupOutput): + """The model output associated with an embedding sequence group.""" + + def __init__( + self, + embeddings: List[float], + ) -> None: + self.embeddings = embeddings + + def __repr__(self) -> str: + return (f"EmbeddingSequenceGroupOutput(" + f"embeddings_shape={len(self.embeddings)})") + + def __eq__(self, other: object) -> bool: + if not isinstance(other, EmbeddingSequenceGroupOutput): + raise NotImplementedError() + return self.embeddings == other.embeddings + + @dataclass class SamplerOutput: """For each sequence group, we generate a list of SequenceOutput object, each of which contains one possible candidate for the next token. - This datastructure implements methods so it can be used like a list, but + This data structure implements methods, so it can be used like a list, but also has optional fields for device tensors. """ - outputs: List[SequenceGroupOutput] + outputs: List[Union[CompletionSequenceGroupOutput, + EmbeddingSequenceGroupOutput]] # On-device tensor containing probabilities of each token. 
sampled_token_probs: Optional["torch.Tensor"] = None diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a4e759095b29..d232393f0fa5 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -5,8 +5,9 @@ from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, + Logprob, SamplerOutput, SequenceGroupMetadata, + SequenceOutput) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index d6f80c82b80b..4dc6c49eb58d 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -4,7 +4,8 @@ import torch -from vllm.sequence import (Logprob, SamplerOutput, SequenceGroupMetadata, +from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, + SamplerOutput, SequenceGroupMetadata, SequenceGroupOutput, SequenceOutput) SeqId = int @@ -94,7 +95,7 @@ def create_sequence_group_output( for topk_logprob_index, _ in enumerate(topk_token_ids) }) - return SequenceGroupOutput( + return CompletionSequenceGroupOutput( samples=[ SequenceOutput(parent_seq_id=seq_id, output_token=token_id, diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py new file mode 100644 index 000000000000..1e4950e7dccb --- /dev/null +++ b/vllm/worker/embedding_model_runner.py @@ -0,0 +1,70 @@ +from typing import List, Optional + +import torch + +from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, + SchedulerConfig, VisionLanguageConfig) +from vllm.logger import init_logger +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.model_runner import ModelRunner + +logger = init_logger(__name__) + + +class EmbeddingModelRunner(ModelRunner): + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + vision_language_config: Optional[VisionLanguageConfig] = None, + ): + super().__init__(model_config, + parallel_config, + scheduler_config, + device_config, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=is_driver_worker, + vision_language_config=vision_language_config) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + kv_caches: List[torch.Tensor], + ) -> Optional[SamplerOutput]: + (input_tokens, input_positions, attn_metadata, _, lora_requests, + lora_mapping, multi_modal_input + ) = self.prepare_input_tensors(seq_group_metadata_list) + + if self.lora_config: + self.set_active_loras(lora_requests, lora_mapping) + + # Execute the model. 
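+        # Embedding models are served without a KV cache: the kv_caches
+        # passed in by the worker are ignored and replaced below with
+        # per-layer None placeholders before the forward pass.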
+ if attn_metadata.use_cuda_graph: + graph_batch_size = input_tokens.shape[0] + model_executable = self.graph_runners[graph_batch_size] + else: + model_executable = self.model + + num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + hidden_states = model_executable(**execute_model_kwargs) + + return self.model.embedding(attn_metadata=attn_metadata, + hidden_states=hidden_states) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 46c6730645c1..e3e5c3d09a5d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,7 +1,7 @@ import contextlib import time from enum import IntEnum -from typing import Dict, List, NamedTuple, Optional, Set, Tuple +from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union import numpy as np import torch @@ -310,9 +310,10 @@ def _prepare_prompt( multi_modal_input_list.append( seq_group_metadata.multi_modal_data.data) - if seq_group_metadata.block_tables is None: + if _is_block_tables_empty(seq_group_metadata.block_tables): # During memory profiling, the block tables are not initialized # yet. In this case, we just use a dummy slot mapping. + # In embeddings, the block tables are {seq_id: None}. slot_mapping.extend([_PAD_SLOT_ID] * seq_len) continue @@ -827,7 +828,6 @@ def profile_run(self) -> None: sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens max_num_seqs = self.scheduler_config.max_num_seqs - # This represents the maximum number of different requests # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request @@ -1166,3 +1166,15 @@ def _prepare_fake_inputs( prompt_tokens = [0] * seq_len fake_image_input = None return SequenceData(prompt_tokens), fake_image_input + + +def _is_block_tables_empty(block_tables: Union[None, Dict]): + """ + Check if block_tables is None or a dictionary with all None values. 
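+    In embedding mode the block tables take the form {seq_id: None}, so a
+    dict whose values are all falsy is also treated as empty.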
+ """ + if block_tables is None: + return True + if isinstance(block_tables, dict) and all( + not value for value in block_tables.values()): + return True + return False diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 313bcf25d887..3161a04bae45 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -20,6 +20,7 @@ from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.worker.cache_engine import CacheEngine +from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.model_runner import ModelRunner from vllm.worker.worker_base import WorkerBase @@ -70,7 +71,9 @@ def __init__( assert not self.lora_config, ( "To be tested: vision language model with LoRA settings.") - self.model_runner = ModelRunner( + ModelRunnerClass = (EmbeddingModelRunner if + self.model_config.embedding_mode else ModelRunner) + self.model_runner = ModelRunnerClass( model_config, parallel_config, scheduler_config, From 473449be5b8be785f06270f2ba9cb4ff67bcfa0e Mon Sep 17 00:00:00 2001 From: Chang Su Date: Wed, 17 Apr 2024 21:16:08 -0700 Subject: [PATCH 02/41] Make LlamaEmbeddingModel generate normalized embeddings Add _check_embedding_mode to OpenAIServingEmbedding --- vllm/entrypoints/openai/serving_embedding.py | 10 +++++++++- vllm/model_executor/models/llama_embedding.py | 3 ++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 976965ea9bc0..10879a1829e2 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -33,7 +33,7 @@ def request_output_to_embedding_response( assert final_res is not None prompt_token_ids = final_res.prompt_token_ids - embedding_data = EmeddingResponseData( + embedding_data = EmbeddingResponseData( index=idx, embedding=final_res.outputs.embedding) data.append(embedding_data) @@ -91,6 +91,7 @@ def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig, model_config=model_config, served_model_names=served_model_names, lora_modules=None) + self._check_embedding_mode(embedding_mode) async def create_embedding(self, request: EmbeddingRequest, raw_request: Request): @@ -155,3 +156,10 @@ async def create_embedding(self, request: EmbeddingRequest, final_res_batch, request_id, created_time, model_name) return response + + def _check_embedding_mode(self, embedding_mode: bool): + if not embedding_mode: + logger.warning( + "embedding_mode is False. 
Embedding API will not work.") + else: + logger.info("Activating the server engine with embedding enabled.") diff --git a/vllm/model_executor/models/llama_embedding.py b/vllm/model_executor/models/llama_embedding.py index 0b2a8791f087..c7c7f5063620 100644 --- a/vllm/model_executor/models/llama_embedding.py +++ b/vllm/model_executor/models/llama_embedding.py @@ -274,9 +274,10 @@ def embedding( ) -> Optional[SamplerOutput]: outputs = self._last_token_pool(hidden_states, attn_metadata.prompt_lens_tensor) + outputs_normalized = nn.functional.normalize(outputs, p=2, dim=1) seq_outputs = [] - for output in outputs: + for output in outputs_normalized: seq_outputs.append( EmbeddingSequenceGroupOutput(embeddings=output.tolist())) return SamplerOutput(outputs=seq_outputs) From 07fc3045d52dd4fa05a741ba0ea5fa5ba785fad3 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 23 Apr 2024 16:23:03 -0700 Subject: [PATCH 03/41] Rename BlockSpaceManagerV3 to EmbeddingModelBlockSpaceManager --- ...lock_manager_v3.py => embedding_model_block_manager.py} | 6 +++--- vllm/core/interfaces.py | 7 ++++--- vllm/core/scheduler.py | 3 ++- 3 files changed, 9 insertions(+), 7 deletions(-) rename vllm/core/{block_manager_v3.py => embedding_model_block_manager.py} (91%) diff --git a/vllm/core/block_manager_v3.py b/vllm/core/embedding_model_block_manager.py similarity index 91% rename from vllm/core/block_manager_v3.py rename to vllm/core/embedding_model_block_manager.py index 0e6fb807bd4d..2b672f983de1 100644 --- a/vllm/core/block_manager_v3.py +++ b/vllm/core/embedding_model_block_manager.py @@ -4,9 +4,9 @@ from vllm.sequence import Sequence, SequenceGroup -class BlockSpaceManagerV3(BlockSpaceManager): - """A simple version of BlockSpaceManager for use in environments - where block management is not required. +class EmbeddingModelBlockSpaceManager(BlockSpaceManager): + """An embedding version of BlockSpaceManager for use in environments + with embedding models where block management is not required. 
This class provides the same interface as BlockSpaceManager, but its methods perform no actions or return simple values like True in specific diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 2e16f1dee350..689cbc2179ee 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -35,9 +35,10 @@ def get_block_space_manager_class(version: str): from vllm.core.block_manager_v2 import BlockSpaceManagerV2 return BlockSpaceManagerV2 - if version == "v3": - from vllm.core.block_manager_v3 import BlockSpaceManagerV3 - return BlockSpaceManagerV3 + if version == "embedding": + from vllm.core.embedding_model_block_manager import ( + EmbeddingModelBlockSpaceManager) + return EmbeddingModelBlockSpaceManager raise ValueError(f"Unknown version {version=}") diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 7319e05bb610..fb6e985b2f31 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -274,7 +274,7 @@ def __init__( if self.scheduler_config.use_v2_block_manager: version = "v2" if self.scheduler_config.embedding_mode: - version = "v3" + version = "embedding" BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class( version) @@ -973,6 +973,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, do_sample=do_sample, + pooling_params=seq_group.pooling_params, token_chunk_size=token_chunk_size, lora_request=seq_group.lora_request, computed_block_nums=common_computed_block_nums, From f8fdd4f50ddd06da5992a878663cb2a62c1af487 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 23 Apr 2024 16:24:52 -0700 Subject: [PATCH 04/41] Clean up LlamaEmbeddingModel - Add Pooler, PoolerOutput - Reuse LlamaModel for LlamaEmbeddingModel - Add PoolingParams to SequenceGroup --- vllm/model_executor/layers/pooler.py | 54 +++ vllm/model_executor/models/llama.py | 45 +++ vllm/model_executor/models/llama_embedding.py | 314 ++---------------- vllm/sequence.py | 37 ++- 4 files changed, 154 insertions(+), 296 deletions(-) create mode 100644 vllm/model_executor/layers/pooler.py diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py new file mode 100644 index 000000000000..af4c177bc2a2 --- /dev/null +++ b/vllm/model_executor/layers/pooler.py @@ -0,0 +1,54 @@ +from enum import IntEnum + +import torch +import torch.nn as nn + +from vllm.attention import AttentionMetadata +from vllm.sequence import EmbeddingSequenceGroupOutput, PoolerOutput + + +class PoolingType(IntEnum): + """Enumeration for different types of pooling methods.""" + LAST = 0 + + +class Pooler(nn.Module): + """A layer that pools specific information from hidden states. + + This layer does the following: + 1. Extracts specific tokens or aggregates data based on pooling method. + 2. Normalizes output if specified. + 3. Returns structured results as `PoolerOutput`. + + Attributes: + pooling_type: The type of pooling to use (LAST, AVERAGE, MAX). + normalize: Whether to normalize the pooled data. 
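+
+    Example (mirroring how LlamaEmbeddingModel builds its pooler in this
+    patch; the tensor names are illustrative):
+        pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        pooler_output = pooler(hidden_states, attention_metadata)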
+ """ + + def __init__(self, pooling_type: PoolingType, normalize: bool): + super().__init__() + self.pooling_type = pooling_type + self.normalize = normalize + + def forward( + self, + hidden_states: torch.Tensor, + attention_metadata: AttentionMetadata, + ) -> PoolerOutput: + """Pools specific information from hidden states based on metadata.""" + prompt_lens = attention_metadata.prompt_lens_tensor + + if self.pooling_type == PoolingType.LAST: + last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1 + pooled_data = hidden_states[last_token_flat_indices] + else: + raise ValueError(f"Invalid pooling type: {self.pooling_type}") + + if self.normalize: + pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1) + + pooled_outputs = [ + EmbeddingSequenceGroupOutput(data.tolist()) for data in pooled_data + ] + + return PoolerOutput(outputs=pooled_outputs) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f6d7fc8733fc..fdcd24ba2e5f 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -298,6 +298,51 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + def load_weights( + self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + ): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + class LlamaForCausalLM(nn.Module): packed_modules_mapping = { diff --git a/vllm/model_executor/models/llama_embedding.py b/vllm/model_executor/models/llama_embedding.py index c7c7f5063620..de6bc16bf6c3 100644 --- a/vllm/model_executor/models/llama_embedding.py +++ b/vllm/model_executor/models/llama_embedding.py @@ -1,204 +1,27 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import List, Optional import torch from torch import nn from transformers import LlamaConfig -from vllm.attention import Attention, AttentionMetadata +from vllm.attention import AttentionMetadata from vllm.config import LoRAConfig -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import EmbeddingSequenceGroupOutput, SamplerOutput +from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.models.llama import LlamaModel +from vllm.sequence import PoolerOutput -class LlamaMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class LlamaAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - bias: bool = False, - sliding_window: Optional[int] = None, - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=bias, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=bias, - linear_method=linear_method, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class LlamaDecoderLayer(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - sliding_window = getattr(config, "sliding_window", None) - self.self_attn = LlamaAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=getattr(config, "num_key_value_heads", - config.num_attention_heads), - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - bias=getattr(config, "bias", False), - sliding_window=sliding_window, - ) - self.mlp = LlamaMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - attn_metadata=attn_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual +class LlamaEmbeddingModel(nn.Module): + """A model that uses Llama with additional embedding functionalities. 
+ This class encapsulates the LlamaModel and provides an interface for + embedding operations and customized pooling functions. -class LlamaEmbeddingModel(nn.Module): + Attributes: + model: An instance of LlamaModel used for forward operations. + _pooler: An instance of Pooler used for pooling operations. + """ def __init__( self, @@ -207,25 +30,8 @@ def __init__( lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - self.layers = nn.ModuleList([ - LlamaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) + self.model = LlamaModel(config, linear_method, lora_config) + self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) def forward( self, @@ -235,52 +41,15 @@ def forward( attn_metadata: AttentionMetadata, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - attn_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states + return self.model.forward(input_ids, positions, kv_caches, + attn_metadata, inputs_embeds) - def _last_token_pool(self, last_hidden_states: torch.Tensor, - prompt_lens_tensor: torch.Tensor) -> torch.Tensor: - # Calculate cumulative lengths to get the start index of each sequence - # in the flattened tensor - cum_lengths = torch.cumsum(prompt_lens_tensor, dim=0) - - # Calculate the flat indices for the last token of each sequence - last_token_flat_indices = cum_lengths - 1 - - # Ensure last_token_flat_indices is compatible for direct indexing - # Select the last tokens based on the calculated flat indices - last_tokens = last_hidden_states[last_token_flat_indices] - - return last_tokens - - def embedding( + def pooler( self, - attn_metadata: AttentionMetadata, hidden_states: torch.Tensor, - ) -> Optional[SamplerOutput]: - outputs = self._last_token_pool(hidden_states, - attn_metadata.prompt_lens_tensor) - outputs_normalized = nn.functional.normalize(outputs, p=2, dim=1) - - seq_outputs = [] - for output in outputs_normalized: - seq_outputs.append( - EmbeddingSequenceGroupOutput(embeddings=output.tolist())) - return SamplerOutput(outputs=seq_outputs) + attention_metadata: AttentionMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, attention_metadata) def load_weights( self, @@ -289,40 +58,5 @@ def load_weights( load_format: str = "auto", revision: Optional[str] = None, ): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in 
hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + self.model.load_weights(model_name_or_path, cache_dir, load_format, + revision) diff --git a/vllm/sequence.py b/vllm/sequence.py index 9d16a847cc48..4529cc5cec01 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -7,6 +7,7 @@ from vllm.block import LogicalTokenBlock from vllm.lora.request import LoRARequest +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams if TYPE_CHECKING: @@ -404,17 +405,19 @@ class SequenceGroup: lora_request: LoRA request. multi_modal_data: Multi modal data associated with the request. embeddings: The embeddings vectors of the prompt of the sequence group. + pooling_params: The pooling parameters used to generate the pooling. """ def __init__( self, request_id: str, seqs: List[Sequence], - sampling_params: SamplingParams, arrival_time: float, + sampling_params: Optional[SamplingParams] = None, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, embeddings: Optional[List[float]] = None, + pooling_params: Optional[PoolingParams] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -429,6 +432,7 @@ def __init__( self.state = SequenceGroupState() self.multi_modal_data = multi_modal_data self.embeddings = embeddings + self.pooling_params = pooling_params @property def prompt(self) -> str: @@ -483,12 +487,13 @@ def set_finished_time(self, time: Optional[float]) -> None: def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" - if self.sampling_params.use_beam_search: + if self.sampling_params and self.sampling_params.use_beam_search: # For beam search, maximally there will always be `best_of` beam # candidates running in the future. return self.sampling_params.best_of else: - if self.sampling_params.best_of > self.num_seqs(): + if (self.sampling_params + and self.sampling_params.best_of > self.num_seqs()): # At prompt stage, the sequence group is not yet filled up # and only have one sequence running. However, in the # generation stage, we will have `best_of` sequences running. @@ -559,7 +564,7 @@ def is_finished(self) -> bool: return all(seq.is_finished() for seq in self.get_seqs()) def is_prefill(self) -> bool: - # Every sequences should be in the same stage. + # Every sequence should be in the same stage. 
return self.get_seqs()[0].is_prefill() def __repr__(self) -> str: @@ -598,6 +603,7 @@ def __init__( sampling_params: SamplingParams, block_tables: Dict[int, List[int]], do_sample: bool = True, + pooling_params: Optional[PoolingParams] = None, token_chunk_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, computed_block_nums: Optional[List[int]] = None, @@ -609,6 +615,7 @@ def __init__( self.seq_data = seq_data self.sampling_params = sampling_params self.block_tables = block_tables + self.pooling_params = pooling_params self.lora_request = lora_request self.computed_block_nums = computed_block_nums self.multi_modal_data = multi_modal_data @@ -736,8 +743,7 @@ class SamplerOutput: also has optional fields for device tensors. """ - outputs: List[Union[CompletionSequenceGroupOutput, - EmbeddingSequenceGroupOutput]] + outputs: List[CompletionSequenceGroupOutput] # On-device tensor containing probabilities of each token. sampled_token_probs: Optional["torch.Tensor"] = None @@ -806,3 +812,22 @@ def clone( num_lookahead_slots=self.num_lookahead_slots, running_queue_size=self.running_queue_size, ) + + +@dataclass +class PoolerOutput: + """The output from a pooling operation in the Llama model.""" + outputs: List[EmbeddingSequenceGroupOutput] + + def __getitem__(self, idx: int): + return self.outputs[idx] + + def __setitem__(self, idx: int, value): + self.outputs[idx] = value + + def __len__(self): + return len(self.outputs) + + def __eq__(self, other: object): + return isinstance(other, + self.__class__) and self.outputs == other.outputs From 8af04f2d95770447c57a823d545ab615670d30d7 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 23 Apr 2024 16:26:02 -0700 Subject: [PATCH 05/41] Use ModelRegistry to enable ModelConfig.embedding_mode - Remove embedding_mode in engine args - Use re to search for XXXModel to match embedding models --- vllm/config.py | 10 ++++++---- vllm/engine/arg_utils.py | 4 +--- vllm/entrypoints/openai/api_server.py | 4 ++-- vllm/entrypoints/openai/serving_embedding.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 8ed6b2f916a5..8248a565c8dd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -73,8 +73,6 @@ class ModelConfig: matches the model name exposed via the APIs. If multiple model names provided, the first name will be used. If not specified, the model name will be the same as `model`. - embedding_mode: Whether the running model is for embedding. It should - be used for embedding models. 
""" def __init__( @@ -97,7 +95,6 @@ def __init__( max_logprobs: int = 5, skip_tokenizer_init: bool = False, served_model_name: Optional[Union[str, List[str]]] = None, - embedding_mode: Optional[bool] = False, ) -> None: self.model = model self.tokenizer = tokenizer @@ -118,7 +115,6 @@ def __init__( or max_context_len_to_capture) self.max_logprobs = max_logprobs self.skip_tokenizer_init = skip_tokenizer_init - self.embedding_mode = embedding_mode self.hf_config = get_config(self.model, trust_remote_code, revision, code_revision) @@ -130,6 +126,7 @@ def __init__( served_model_name) if not self.skip_tokenizer_init: self._verify_tokenizer_mode() + self.embedding_mode = self._check_embedding_mode() self._verify_quantization() self._verify_cuda_graph() @@ -215,6 +212,11 @@ def _verify_cuda_graph(self) -> None: self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) + def _check_embedding_mode(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + pattern = r".*Model$" + return any(re.match(pattern, arch) for arch in architectures) + def verify_with_parallel_config( self, parallel_config: "ParallelConfig", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 32d3d3cc7be5..14a751ad2811 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -77,7 +77,6 @@ class EngineArgs: image_feature_size: Optional[int] = None scheduler_delay_factor: float = 0.0 enable_chunked_prefill: bool = False - embedding_mode: bool = False guided_decoding_backend: str = 'outlines' # Speculative decoding configuration. @@ -529,8 +528,7 @@ def create_engine_config(self, ) -> EngineConfig: self.quantization, self.quantization_param_path, self.enforce_eager, self.max_context_len_to_capture, self.max_seq_len_to_capture, self.max_logprobs, - self.skip_tokenizer_init, self.served_model_name, - self.embedding_mode) + self.skip_tokenizer_init, self.served_model_name) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1ec7c96f848c..b7711d9e783f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -205,8 +205,8 @@ async def authentication(request: Request, call_next): args.chat_template) openai_serving_completion = OpenAIServingCompletion( engine, model_config, served_model_names, args.lora_modules) - openai_serving_embedding = OpenAIServingEmbedding(engine,model_config, - served_model_names) + openai_serving_embedding = OpenAIServingEmbedding( + engine, model_config, served_model_names) app.root_path = args.root_path uvicorn.run(app, host=args.host, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 10879a1829e2..a4d3ef9511c7 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -91,7 +91,7 @@ def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig, model_config=model_config, served_model_names=served_model_names, lora_modules=None) - self._check_embedding_mode(embedding_mode) + self._check_embedding_mode(model_config.embedding_mode) async def create_embedding(self, request: EmbeddingRequest, raw_request: Request): From e937412930d318e57a73b30352ff2886a8102923 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 23 Apr 2024 16:32:49 -0700 Subject: [PATCH 06/41] Separating PoolerOutput, PoolingParams from SamplingXXX - 
Update SequenceGroup initialization in tests/ - Use PoolingParams and PoolerOutput for embedding_model_runner - Add PoolingMetadata --- tests/core/test_block_manager.py | 14 ++- tests/core/utils.py | 11 +- tests/test_sequence.py | 11 +- vllm/core/embedding_model_block_manager.py | 10 +- vllm/entrypoints/openai/protocol.py | 13 ++- vllm/entrypoints/openai/serving_embedding.py | 32 +----- vllm/executor/gpu_executor.py | 8 +- vllm/model_executor/pooling_metadata.py | 32 ++++++ vllm/pooling_params.py | 20 ++++ vllm/worker/embedding_model_runner.py | 108 +++++++++++++++++-- vllm/worker/worker.py | 6 +- 11 files changed, 202 insertions(+), 63 deletions(-) create mode 100644 vllm/model_executor/pooling_metadata.py create mode 100644 vllm/pooling_params.py diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 9db58e075196..a4996b5ab097 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -142,8 +142,11 @@ def test_append_slot_cow(): child = prompt.fork(new_seq_id=2) # Allocate space for the sequence group. - seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), - time.time(), time.perf_counter) + seq_group = SequenceGroup(request_id="1", + seqs=[prompt, child], + arrival_time=time.time(), + sampling_params=SamplingParams(), + lora_request=time.perf_counter) block_manager.allocate(seq_group) # Fork and append a new token id. We expect a COW to be scheduled. @@ -303,8 +306,11 @@ def test_sliding_window_multi_seq(): assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks parent = Sequence(1, "one two three", [0, 1, 2], block_size) - seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(), - None) + seq_group = SequenceGroup(request_id="1", + seqs=[parent], + arrival_time=time.time(), + sampling_params=SamplingParams(), + lora_request=None) block_manager.allocate(seq_group) # assert the number of blocks allocated is correct diff --git a/tests/core/utils.py b/tests/core/utils.py index 22c1d3826dff..8fb13177a2d6 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -22,10 +22,13 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) - seq_group = SequenceGroup( - request_id, [prompt], - SamplingParams(use_beam_search=use_beam_search, best_of=best_of), - time.time(), lora_request) + seq_group = SequenceGroup(request_id=request_id, + seqs=[prompt], + arrival_time=time.time(), + sampling_params=SamplingParams( + use_beam_search=use_beam_search, + best_of=best_of), + lora_request=lora_request) return prompt, seq_group diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 3f691f95a1f5..b5429bf771e0 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -26,10 +26,13 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) - seq_group = SequenceGroup( - request_id, [prompt], - SamplingParams(use_beam_search=use_beam_search, best_of=best_of), - time.time(), lora_request) + seq_group = SequenceGroup(request_id=request_id, + seqs=[prompt], + arrival_time=time.time(), + sampling_params=SamplingParams( + use_beam_search=use_beam_search, + best_of=best_of), + lora_request=lora_request) return seq_group diff --git a/vllm/core/embedding_model_block_manager.py 
b/vllm/core/embedding_model_block_manager.py index 2b672f983de1..f17c0ac03ae6 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/embedding_model_block_manager.py @@ -30,14 +30,14 @@ def allocate(self, seq_group: SequenceGroup) -> None: def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: - pass + return True def append_slots( self, seq: Sequence, num_lookahead_slots: int, ) -> Dict[int, List[int]]: - pass + return {} def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: pass @@ -48,17 +48,17 @@ def can_swap_in(self, seq_group: SequenceGroup, def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: - pass + return {} def can_swap_out(self, seq_group: SequenceGroup) -> bool: return True def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: - pass + return {} def free(self, seq: Sequence) -> None: # No operation on free - pass + return def get_block_table(self, seq: Sequence) -> List[int]: return [] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 8d83d6c652de..139c5716c7ce 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1,13 +1,14 @@ # Adapted from # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py import time -from typing import Dict, List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union import torch from openai.types.chat import ChatCompletionMessageParam from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Annotated +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid @@ -364,12 +365,22 @@ def check_guided_decoding_count(cls, data): class EmbeddingRequest(BaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/embeddings model: str input: Union[List[int], List[List[int]], str, List[str]] encoding_format: Optional[str] = Field('float', pattern='^(float|base64)$') dimensions: Optional[int] = None user: Optional[str] = None + # doc: begin-embedding-pooling-params + additional_data: Optional[Any] = None + + # doc: end-embedding-pooling-params + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + class LogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index a4d3ef9511c7..934fb861d935 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -14,7 +14,7 @@ from vllm.logger import init_logger from vllm.outputs import EmbeddingRequestOutput from vllm.sampling_params import SamplingParams -from vllm.utils import random_uuid +from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -53,36 +53,6 @@ def request_output_to_embedding_response( ) -def merge_async_iterators(*iterators): - """Merge multiple asynchronous iterators into a single iterator. - - This method handle the case where some iterators finish before others. - When it yields, it yields a tuple (i, item) where i is the index of the - iterator that yields the item. 
- """ - queue = asyncio.Queue() - - finished = [False] * len(iterators) - - async def producer(i, iterator): - async for item in iterator: - await queue.put((i, item)) - finished[i] = True - - _tasks = [ - asyncio.create_task(producer(i, iterator)) - for i, iterator in enumerate(iterators) - ] - - async def consumer(): - while not all(finished) or not queue.empty(): - item = await queue.get() - yield item - await asyncio.gather(*_tasks) - - return consumer() - - class OpenAIServingEmbedding(OpenAIServing): def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig, diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index fa3480fa6483..47abdfca4556 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -1,9 +1,9 @@ -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple, Union from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, make_async) from vllm.worker.worker_base import WorkerWrapperBase @@ -124,7 +124,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: def execute_model( self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + execute_model_req: ExecuteModelRequest) -> List[Union[SamplerOutput, PoolerOutput]]: output = self.driver_worker.execute_model(execute_model_req) return output @@ -150,7 +150,7 @@ class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, execute_model_req: ExecuteModelRequest, - ) -> List[SamplerOutput]: + ) -> List[Union[SamplerOutput, PoolerOutput]]: output = await make_async(self.driver_worker.execute_model )(execute_model_req=execute_model_req, ) return output diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py new file mode 100644 index 000000000000..e34308bf40ba --- /dev/null +++ b/vllm/model_executor/pooling_metadata.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, List, Tuple + +from vllm.pooling_params import PoolingParams + + +class PoolingMetadata: + """Metadata for pooling operations in the Pooler layer. + + This class holds the necessary information for pooling operations, + providing context for how to perform pooling and other related operations. + + Attributes: + seq_groups: List of (seq_ids, pooling_params). + seq_data: A mapping of sequence ID to additional sequence data. + prompt_lens: List of the lengths of each prompt. + """ + + def __init__( + self, + seq_groups: List[Tuple[List[int], PoolingParams]], + seq_data: Dict[int, Any], # Specific data related to sequences + prompt_lens: List[int], + ) -> None: + self.seq_groups = seq_groups + self.seq_data = seq_data + self.prompt_lens = prompt_lens + + def __repr__(self) -> str: + return ("PoolingMetadata(" + f"seq_groups={self.seq_groups}, " + f"seq_data={self.seq_data}, " + f"prompt_lens={self.prompt_lens}, ") diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py new file mode 100644 index 000000000000..3b95d73ddc2c --- /dev/null +++ b/vllm/pooling_params.py @@ -0,0 +1,20 @@ +from typing import Any, Optional + + +class PoolingParams: + """Pooling parameters for pooling. 
+ + Attributes: + additional_data: Any additional data needed for pooling. + """ + + def __init__(self, additional_data: Optional[Any] = None): + self.additional_data = additional_data + + def clone(self) -> "PoolingParams": + """Returns a deep copy of the PoolingParams instance.""" + return PoolingParams(additional_data=self.additional_data, ) + + def __repr__(self) -> str: + return (f"PoolingParams(" + f"additional_metadata={self.additional_data})") diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 1e4950e7dccb..05f74696d9d6 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -1,11 +1,17 @@ -from typing import List, Optional +from typing import Dict, List, Optional, Set, Tuple import torch +from vllm.attention import AttentionMetadata from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.logger import init_logger -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.lora.layers import LoRAMapping +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.pooling_params import PoolingParams +from vllm.sequence import PoolerOutput, SequenceData, SequenceGroupMetadata from vllm.worker.model_runner import ModelRunner logger = init_logger(__name__) @@ -38,9 +44,9 @@ def execute_model( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], - ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, attn_metadata, _, lora_requests, - lora_mapping, multi_modal_input + ) -> Optional[PoolerOutput]: + (input_tokens, input_positions, attn_metadata, pooling_metadata, + lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: @@ -66,5 +72,93 @@ def execute_model( execute_model_kwargs.update({"image_input": multi_modal_input}) hidden_states = model_executable(**execute_model_kwargs) - return self.model.embedding(attn_metadata=attn_metadata, - hidden_states=hidden_states) + return self.model.pooler(hidden_states=hidden_states, + attention_metadata=attn_metadata) + + def prepare_input_tensors( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata, + Set[int], LoRAMapping, torch.Tensor]: + if self.is_driver_worker: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. + if is_prompt: + (input_tokens, input_positions, attn_metadata, prompt_lens, + subquery_lens, lora_index_mapping, lora_prompt_mapping, + lora_requests, multi_modal_input + ) = self._prepare_prompt(seq_group_metadata_list) + else: + logger.warning( + "Embedding model should not have non-prompt inputs.") + (input_tokens, input_positions, attn_metadata, + lora_index_mapping, lora_prompt_mapping, + lora_requests) = self._prepare_decode(seq_group_metadata_list) + prompt_lens = [] + multi_modal_input = None + pooling_metadata = self._prepare_pooling(seq_group_metadata_list, + prompt_lens) + + if self.lora_config: + lora_mapping = LoRAMapping( + lora_index_mapping, + lora_prompt_mapping, + ) + else: + lora_mapping = None + + # Broadcast the metadata. 
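+            # The driver worker sends the prepared inputs (plus the pooling
+            # prompt lengths) to the remaining tensor-parallel workers so they
+            # can rebuild the same tensors instead of re-running input
+            # preparation.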
+ metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "lora_requests": lora_requests, + "lora_mapping": lora_mapping, + "multi_modal_input": multi_modal_input, + "prompt_lens": pooling_metadata.prompt_lens, + } + metadata_dict.update(attn_metadata.asdict_zerocopy()) + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + print(metadata_dict) + input_tokens = metadata_dict.pop("input_tokens") + input_positions = metadata_dict.pop("input_positions") + lora_mapping = metadata_dict.pop("lora_mapping") + lora_requests = metadata_dict.pop("lora_requests") + multi_modal_input = metadata_dict.pop("multi_modal_input") + attn_metadata = self.attn_backend.make_metadata(**metadata_dict) + + prompt_lens = metadata_dict.pop("prompt_lens") + + pooling_metadata = PoolingMetadata(seq_groups=None, + seq_data=None, + prompt_lens=None) + + return (input_tokens, input_positions, attn_metadata, pooling_metadata, + lora_requests, lora_mapping, multi_modal_input) + + def _prepare_pooling( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], + ) -> PoolingMetadata: + """Prepare PoolingMetadata for the sequence group metadata list.""" + seq_groups: List[Tuple[List[int], PoolingParams]] = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_ids = list(seq_group_metadata.seq_data.keys()) + pooling_params = seq_group_metadata.pooling_params + seq_groups.append((seq_ids, pooling_params)) + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + pooling_metadata = PoolingMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + ) + + return pooling_metadata diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 3161a04bae45..c3fc1ef6bd4c 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,7 +1,7 @@ """A GPU worker class.""" import gc import os -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple, Union import torch import torch.distributed @@ -18,7 +18,7 @@ init_custom_ar) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.model_runner import ModelRunner @@ -214,7 +214,7 @@ def cache_swap( def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + ) -> List[Union[SamplerOutput, PoolerOutput]]: if execute_model_req is None: seq_group_metadata_list = None From e59c6a5dc6ea8e91a66157684f9349d3f9268e92 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 23 Apr 2024 16:37:09 -0700 Subject: [PATCH 07/41] Separating LLM.encode() from LLM.generate() - Add LLM.encode() - Add encode() in AsyncLLMEngine --- vllm/__init__.py | 2 + vllm/engine/async_llm_engine.py | 142 ++++++++++++++++--- vllm/engine/llm_engine.py | 97 ++++++++++--- vllm/entrypoints/llm.py | 138 ++++++++++++++---- vllm/entrypoints/openai/serving_embedding.py | 1 + 5 files changed, 317 insertions(+), 63 deletions(-) diff --git a/vllm/__init__.py b/vllm/__init__.py index 01e62ebbbe51..f30fad73e03a 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -8,6 +8,7 @@ 
from vllm.model_executor.models import ModelRegistry from vllm.outputs import (CompletionOutput, CompletionRequestOutput, EmbeddingRequestOutput, RequestOutput) +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams __version__ = "0.4.2" @@ -25,4 +26,5 @@ "AsyncLLMEngine", "AsyncEngineArgs", "initialize_ray_cluster", + "PoolingParams", ] diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 86925972b9ab..f2c9d0689f21 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -15,6 +15,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import CompletionRequestOutput, EmbeddingRequestOutput +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, MultiModalData, SamplerOutput from vllm.usage.usage_lib import UsageContext @@ -277,8 +278,8 @@ async def add_request_async( return self.add_request(request_id, prompt=prompt, - prompt_token_ids=prompt_token_ids, sampling_params=sampling_params, + prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, lora_request=lora_request, multi_modal_data=multi_modal_data) @@ -518,7 +519,7 @@ async def add_request( self, request_id: str, prompt: Optional[str], - sampling_params: SamplingParams, + params: Union[SamplingParams, PoolingParams], prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, @@ -535,9 +536,9 @@ async def add_request( max_log_len] logger.info( "Received request %s: prompt: %r, " - "sampling_params: %s, prompt_token_ids: %s, " - "lora_request: %s.", request_id, shortened_prompt, - sampling_params, shortened_token_ids, lora_request) + "params: %s, prompt_token_ids: %s, " + "lora_request: %s.", request_id, shortened_prompt, params, + shortened_token_ids, lora_request) if not self.is_running: if self.start_engine_loop: @@ -569,7 +570,7 @@ async def add_request( stream = self._request_tracker.add_request( request_id, prompt=prompt, - sampling_params=sampling_params, + params=params, prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, lora_request=lora_request, @@ -586,7 +587,7 @@ async def generate( prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None - ) -> AsyncIterator[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: + ) -> AsyncIterator[CompletionRequestOutput]: """Generate outputs for a request. Generate outputs for a request. This method is a coroutine. It adds the @@ -650,25 +651,128 @@ async def generate( >>> # Process and return the final output >>> ... """ - # Preprocess the request. - arrival_time = time.time() - - try: - stream = await self.add_request( + async for output in self.process_request( request_id, prompt, sampling_params, - prompt_token_ids=prompt_token_ids, - arrival_time=arrival_time, - lora_request=lora_request, - multi_modal_data=multi_modal_data, - ) + prompt_token_ids, + None, # No arrival time + lora_request, + multi_modal_data, + ): + yield output + + async def encode( + self, + prompt: Optional[str], + pooling_params: PoolingParams, + request_id: str, + prompt_token_ids: Optional[List[int]] = None, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None + ) -> AsyncIterator[EmbeddingRequestOutput]: + """Generate outputs for a request. 
+ + Generate outputs for a request. This method is a coroutine. It adds the + request into the waiting queue of the LLMEngine and streams the outputs + from the LLMEngine to the caller. + + Args: + prompt: The prompt string. Can be None if prompt_token_ids is + provided. + pooling_params: The pooling parameters of the request. + request_id: The unique id of the request. + prompt_token_ids: The token IDs of the prompt. If None, we + use the tokenizer to convert the prompts to token IDs. + lora_request: LoRA request to use for generation, if any. + multi_modal_data: Multi modal data per request. + + Yields: + The output `RequestOutput` or `EmbeddingRequestOutput` objects + from the LLMEngine for the request. + + Details: + - If the engine is not running, start the background loop, + which iteratively invokes + :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` + to process the waiting requests. + - Add the request to the engine's `RequestTracker`. + On the next background loop, this request will be sent to + the underlying engine. + Also, a corresponding `AsyncStream` will be created. + - Wait for the request outputs from `AsyncStream` and yield them. + + Example: + >>> # Please refer to entrypoints/api_server.py for + >>> # the complete example. + >>> + >>> # initialize the engine and the example input + >>> engine = AsyncLLMEngine.from_engine_args(engine_args) + >>> example_input = { + >>> "input": "What is LLM?", + >>> "request_id": 0, + >>> } + >>> + >>> # start the generation + >>> results_generator = engine.encode( + >>> example_input["input"], + >>> PoolingParams(), + >>> example_input["request_id"]) + >>> + >>> # get the results + >>> final_output = None + >>> async for request_output in results_generator: + >>> if await request.is_disconnected(): + >>> # Abort the request if the client disconnects. + >>> await engine.abort(request_id) + >>> # Return or raise an error + >>> ... + >>> final_output = request_output + >>> + >>> # Process and return the final output + >>> ... + """ + async for output in self.process_request( + request_id, + prompt, + pooling_params, + prompt_token_ids, + None, + lora_request, + multi_modal_data, + ): + yield output + + async def process_request( + self, + request_id: str, + prompt: Optional[str], + params: Union[SamplingParams, PoolingParams], + prompt_token_ids: Optional[List[int]] = None, + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> AsyncIterator[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: + """Common logic to process requests with SamplingParams or + PoolingParams.""" + # Preprocess the request and set default arrival time if not provided + if arrival_time is None: + arrival_time = time.time() + stream = await self.add_request( + request_id, + prompt, + params, + prompt_token_ids=prompt_token_ids, + arrival_time=arrival_time, + lora_request=lora_request, + multi_modal_data=multi_modal_data, + ) + + try: async for request_output in stream: yield request_output except (Exception, asyncio.CancelledError) as e: - # If there is an exception or coroutine is cancelled, abort the - # request. 
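Outside of the API server, the simplest way to drive the new encode() coroutine is to exhaust the stream and keep the last item; a minimal sketch (the embed_one helper and its argument names are ours, not part of vLLM):

from vllm import PoolingParams

async def embed_one(engine, text: str, request_id: str):
    # `engine` is an already-initialized AsyncLLMEngine.
    final_output = None
    async for request_output in engine.encode(text, PoolingParams(), request_id):
        final_output = request_output
    # EmbeddingRequestOutput.outputs.embedding holds the vector as a list of floats.
    return final_output.outputs.embedding

# e.g. asyncio.run(embed_one(engine, "What is LLM?", "0")) when no loop is running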
self._abort(request_id) raise e diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e982a1e3211a..fff73079c36d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -22,10 +22,11 @@ from vllm.lora.request import LoRARequest from vllm.outputs import (CompletionRequestOutput, EmbeddingRequestOutput, RequestOutputFactory) +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.sequence import (CompletionSequenceGroupOutput, - EmbeddingSequenceGroupOutput, ExecuteModelRequest, - MultiModalData, SamplerOutput, Sequence, + EmbeddingSequenceGroupOutput, ExecuteModelRequest, MultiModalData, + PoolerOutput, SamplerOutput, Sequence, SequenceGroup, SequenceStatus, SequenceGroupMetadata) from vllm.transformers_utils.detokenizer import Detokenizer @@ -358,7 +359,7 @@ def add_request( self, request_id: str, prompt: Optional[str], - sampling_params: SamplingParams, + params: Union[SamplingParams, PoolingParams], prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, @@ -374,7 +375,8 @@ def add_request( request_id: The unique ID of the request. prompt: The prompt string. Can be None if prompt_token_ids is provided. - sampling_params: The sampling parameters for text generation. + params: Parameters for sampling or pooling. SamplingParams + for text generation. PoolingParams for pooling. prompt_token_ids: The token IDs of the prompt. If None, we use the tokenizer to convert the prompts to token IDs. arrival_time: The arrival time of the request. If None, we use @@ -408,13 +410,6 @@ def add_request( if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") - max_logprobs = self.get_model_config().max_logprobs - if (sampling_params.logprobs - and sampling_params.logprobs > max_logprobs) or ( - sampling_params.prompt_logprobs - and sampling_params.prompt_logprobs > max_logprobs): - raise ValueError(f"Cannot request more than " - f"{max_logprobs} logprobs.") if arrival_time is None: arrival_time = time.time() prompt_token_ids = self.encode_request( @@ -436,6 +431,50 @@ def add_request( seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, eos_token_id, lora_request) + # Create a SequenceGroup based on SamplingParams or PoolingParams + if isinstance(params, SamplingParams): + seq_group = self._create_sequence_group_with_sampling( + request_id, + seq, + params, + arrival_time, + lora_request, + multi_modal_data, + ) + elif isinstance(params, PoolingParams): + seq_group = self._create_sequence_group_with_pooling( + request_id, + seq, + params, + arrival_time, + lora_request, + multi_modal_data, + ) + else: + raise ValueError( + "Either SamplingParams or PoolingParams must be provided.") + + # Add the sequence group to the scheduler. 
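With this change a single add_request() entry point serves both workloads, and the type of `params` alone decides which helper builds the SequenceGroup. A hedged usage sketch (the model name is an example, not a requirement of this patch):

from vllm import EngineArgs, LLMEngine, SamplingParams

engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

# SamplingParams routes through _create_sequence_group_with_sampling.
engine.add_request("gen-0", "Hello, my name is", SamplingParams(max_tokens=16))

while engine.has_unfinished_requests():
    for output in engine.step():
        if output.finished:
            print(output.outputs[0].text)

# Passing PoolingParams() instead routes through
# _create_sequence_group_with_pooling; that path needs an engine built around
# an embedding model (for example intfloat/e5-mistral-7b-instruct).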
+ self.scheduler.add_seq_group(seq_group) + + def _create_sequence_group_with_sampling( + self, + request_id: str, + seq: Sequence, + sampling_params: SamplingParams, + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> SequenceGroup: + """Creates a SequenceGroup with SamplingParams.""" + max_logprobs = self.get_model_config().max_logprobs + if (sampling_params.logprobs + and sampling_params.logprobs > max_logprobs) or ( + sampling_params.prompt_logprobs + and sampling_params.prompt_logprobs > max_logprobs): + raise ValueError(f"Cannot request more than " + f"{max_logprobs} logprobs.") + # Defensive copy of SamplingParams, which are used by the sampler, # this doesn't deep-copy LogitsProcessor objects sampling_params = sampling_params.clone() @@ -447,11 +486,35 @@ def add_request( self.generation_config_fields) # Create the sequence group. - seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request, multi_modal_data) + seq_group = SequenceGroup(request_id=request_id, + seqs=[seq], + arrival_time=arrival_time, + sampling_params=sampling_params, + lora_request=lora_request, + multi_modal_data=multi_modal_data) - # Add the sequence group to the scheduler. - self.scheduler.add_seq_group(seq_group) + return seq_group + + def _create_sequence_group_with_pooling( + self, + request_id: str, + seq: Sequence, + pooling_params: PoolingParams, + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> SequenceGroup: + """Creates a SequenceGroup with PoolingParams.""" + # Defensive copy of PoolingParams, which are used by the pooler + pooling_params = pooling_params.clone() + # Create the sequence group. + seq_group = SequenceGroup(request_id=request_id, + seqs=[seq], + arrival_time=arrival_time, + lora_request=lora_request, + multi_modal_data=multi_modal_data, + pooling_params=pooling_params) + return seq_group def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: """Aborts a request(s) with the given ID. 
@@ -504,7 +567,7 @@ def _process_sequence_group_outputs( def _process_model_outputs( self, - output: List[SamplerOutput], + output: List[Union[SamplerOutput, PoolerOutput]], scheduled_seq_groups: List[ScheduledSequenceGroup], ignored_seq_groups: List[SequenceGroup], seq_group_metadata_list: List[SequenceGroupMetadata], @@ -592,7 +655,7 @@ def step( >>> while True: >>> if example_inputs: >>> req_id, prompt, sampling_params = example_inputs.pop(0) - >>> engine.add_request(str(req_id), prompt, sampling_params) + >>> engine.add_request(str(req_id),prompt,sampling_params) >>> >>> # continue the request processing >>> request_outputs = engine.step() diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f65584479592..a7bd3c79c27f 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -6,13 +6,17 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import CompletionRequestOutput, EmbeddingRequestOutput +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.sequence import MultiModalData from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter +logger = init_logger(__name__) + class LLM: """An LLM for generating texts from given prompts and sampling parameters. @@ -75,8 +79,6 @@ class LLM: When a sequence has context length larger than this, we fall back to eager mode. disable_custom_all_reduce: See ParallelConfig - embedding_mode: Whether the running model is for embedding. It should - be used for embedding models. """ def __init__( @@ -98,7 +100,6 @@ def __init__( max_context_len_to_capture: Optional[int] = None, max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, - embedding_mode: bool = False, **kwargs, ) -> None: if "disable_log_stats" not in kwargs: @@ -121,7 +122,6 @@ def __init__( max_context_len_to_capture=max_context_len_to_capture, max_seq_len_to_capture=max_seq_len_to_capture, disable_custom_all_reduce=disable_custom_all_reduce, - embedding_mode=embedding_mode, **kwargs, ) self.llm_engine = LLMEngine.from_engine_args( @@ -147,7 +147,7 @@ def generate( use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, - ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: + ) -> List[CompletionRequestOutput]: """Generates the completions for the input prompts. NOTE: This class automatically batches the given prompts, considering @@ -168,8 +168,89 @@ def generate( multi_modal_data: Multi modal data. Returns: - A list of `RequestOutput` objects containing the generated - completions in the same order as the input prompts. + A list of `CompletionRequestOutput` objects containing the + generated completions in the same order as the input prompts. + """ + if sampling_params is None: + # Use default sampling params. 
+ sampling_params = SamplingParams() + + requests_data = self._validate_and_prepare_requests( + prompts, + sampling_params, + prompt_token_ids, + lora_request, + multi_modal_data, + ) + + # Add requests to the engine and run the engine + for request_data in requests_data: + self._add_request(**request_data) + + return self._run_engine(use_tqdm) + + def encode( + self, + prompts: Optional[Union[str, List[str]]] = None, + pooling_params: Optional[Union[PoolingParams, + List[PoolingParams]]] = None, + prompt_token_ids: Optional[List[List[int]]] = None, + use_tqdm: bool = True, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[EmbeddingRequestOutput]: + """Generates the completions for the input prompts. + + NOTE: This class automatically batches the given prompts, considering + the memory constraint. For the best performance, put all of your prompts + into a single list and pass it to this method. + + Args: + prompts: A list of prompts to generate completions for. + pooling_params: The pooling parameters for pooling. If None, we + use the default pooling parameters. + prompt_token_ids: A list of token IDs for the prompts. If None, we + use the tokenizer to convert the prompts to token IDs. + use_tqdm: Whether to use tqdm to display the progress bar. + lora_request: LoRA request to use for generation, if any. + multi_modal_data: Multi modal data. + + Returns: + A list of `EmbeddingRequestOutput` objects containing the + generated embeddings in the same order as the input prompts. + """ + if pooling_params is None: + # Use default pooling params. + pooling_params = PoolingParams() + + requests_data = self._validate_and_prepare_requests( + prompts, + pooling_params, + prompt_token_ids, + lora_request, + multi_modal_data, + ) + + # Add requests to the engine and run the engine + for request_data in requests_data: + self._add_request(**request_data) + + return self._run_engine(use_tqdm) + + def _validate_and_prepare_requests( + self, + prompts: Optional[Union[str, List[str]]], + params: Union[Union[SamplingParams, PoolingParams], + List[Union[SamplingParams, + PoolingParams]]], # Unified parameter + prompt_token_ids: Optional[List[List[int]]] = None, + lora_request: Optional[LoRARequest] = None, + multi_modal_data: Optional[MultiModalData] = None, + ) -> List[dict]: + """Validates and prepares request data for adding to the engine. + + Ensures prompts and token IDs are consistent, and returns a list of + dictionaries with request data for further processing. """ if prompts is None and prompt_token_ids is None: raise ValueError("Either prompts or prompt_token_ids must be " @@ -192,40 +273,43 @@ def generate( assert prompt_token_ids is not None num_requests = len(prompt_token_ids) - if sampling_params is None: - # Use default sampling params. - sampling_params = SamplingParams() - - elif isinstance(sampling_params, - list) and len(sampling_params) != num_requests: - raise ValueError("The lengths of prompts and sampling_params " + if isinstance(params, list) and len(params) != num_requests: + raise ValueError("The lengths of prompts and params " "must be the same.") if multi_modal_data: multi_modal_data.data = multi_modal_data.data.to(torch.float16) # Add requests to the engine. 
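For the offline path, a short sketch of the new encode() call (the model name and the 4096-dimensional output are taken from the tests added later in this series):

from vllm import LLM, PoolingParams

llm = LLM(model="intfloat/e5-mistral-7b-instruct")
outputs = llm.encode(
    ["The chef prepared a delicious meal.",
     "A feline was resting on a rug."],
    PoolingParams(),
)
for out in outputs:                    # EmbeddingRequestOutput objects
    print(len(out.outputs.embedding))  # 4096 for this model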
+ requests_data = [] for i in range(num_requests): prompt = prompts[i] if prompts is not None else None token_ids = None if prompt_token_ids is None else prompt_token_ids[ i] - self._add_request( + + multi_modal_item = MultiModalData( + type=multi_modal_data.type, + data=multi_modal_data.data[i].unsqueeze(0), + ) if multi_modal_data else None + + requests_data.append({ + "prompt": prompt, - sampling_params[i] - if isinstance(sampling_params, list) else sampling_params, + "params": + params[i] if isinstance(params, list) else params, + "prompt_token_ids": token_ids, - lora_request=lora_request, - # Get ith image while maintaining the batch dim. - multi_modal_data=MultiModalData( - type=multi_modal_data.type, - data=multi_modal_data.data[i].unsqueeze(0)) - if multi_modal_data else None, - ) - return self._run_engine(use_tqdm) + "lora_request": + lora_request, + "multi_modal_data": + multi_modal_item, + }) + + return requests_data def _add_request( self, prompt: Optional[str], - sampling_params: SamplingParams, + params: Union[SamplingParams, PoolingParams], prompt_token_ids: Optional[List[int]], lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, @@ -233,7 +317,7 @@ def _add_request( request_id = str(next(self.request_counter)) self.llm_engine.add_request(request_id, prompt, - sampling_params, + params, prompt_token_ids, lora_request=lora_request, multi_modal_data=multi_modal_data) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 934fb861d935..b0c69a23cedd 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -90,6 +90,7 @@ async def create_embedding(self, request: EmbeddingRequest, generators = [] try: prompt_is_tokens, prompts = parse_prompt_format(request.input) + pooling_params = request.to_pooling_params() for i, prompt in enumerate(prompts): if prompt_is_tokens: From 79aa97107e6186a7c064d1ec0e629f419656af9c Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 23 Apr 2024 16:37:35 -0700 Subject: [PATCH 08/41] Add tests for LlamaEmbeddingModel and OpenaiAPI server embedding --- tests/conftest.py | 35 +++++++++++---- tests/entrypoints/test_openai_server.py | 58 +++++++++++++++++++++++++ tests/models/test_llama_embedding.py | 47 ++++++++++++++++++++ 3 files changed, 132 insertions(+), 8 deletions(-) create mode 100644 tests/models/test_llama_embedding.py diff --git a/tests/conftest.py b/tests/conftest.py index 1f2ad1cbd729..d6af7378c8c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,7 @@ import pytest import torch from PIL import Image +from sentence_transformers import SentenceTransformer from transformers import (AutoModelForCausalLM, AutoProcessor, LlavaForConditionalGeneration) @@ -133,6 +134,10 @@ def example_long_prompts() -> List[str]: "llava-hf/llava-1.5-7b-hf": LlavaForConditionalGeneration, } +_EMBEDDING_MODELS = [ + "intfloat/e5-mistral-7b-instruct", +] + class HfRunner: @@ -145,14 +150,7 @@ def __init__( assert dtype in _STR_DTYPE_TO_TORCH_DTYPE torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] self.model_name = model_name - if model_name not in _VISION_LANGUAGE_MODELS: - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - ).cuda() - self.processor = None - else: + if model_name in _VISION_LANGUAGE_MODELS: self.model = _VISION_LANGUAGE_MODELS[model_name].from_pretrained( model_name, torch_dtype=torch_dtype, @@ -162,6 +160,16 @@ def 
__init__( model_name, torch_dtype=torch_dtype, ) + elif model_name in _EMBEDDING_MODELS: + print("using sentence transformer") + self.model = SentenceTransformer(model_name, ).cuda() + else: + self.model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ).cuda() + self.processor = None if tokenizer_name is None: tokenizer_name = model_name self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True) @@ -334,6 +342,9 @@ def generate_greedy_logprobs_limit( return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] + def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]: + return self.model.encode(prompts) + def __del__(self): del self.model cleanup() @@ -459,6 +470,14 @@ def generate_beam_search( outputs = self.generate(prompts, beam_search_params) return outputs + def encode(self, prompts: List[str]) -> List[List[float]]: + req_outputs = self.model.encode(prompts) + outputs = [] + for req_output in req_outputs: + embedding = req_output.outputs.embedding + outputs.append(embedding) + return outputs + def __del__(self): del self.model cleanup() diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e53e64a0c1ff..7d3b6de329c1 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -23,6 +23,7 @@ MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" # technically this needs Mistral-7B-v0.1 as base, but we're not testing # generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" @@ -121,6 +122,24 @@ def zephyr_lora_files(): return snapshot_download(repo_id=LORA_NAME) +@pytest.fixture(scope="session") +def embedding_server(zephyr_lora_files): + ray.init() + server_runner = ServerRunner.remote([ + "--model", + EMBEDDING_MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + ]) + ray.get(server_runner.ready.remote()) + yield server_runner + ray.shutdown() + + @pytest.fixture(scope="session") def server(zephyr_lora_files): ray.init() @@ -461,6 +480,45 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, assert texts[0] == texts[1] +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_embedding(embedding_server, client: openai.AsyncOpenAI, + model_name: str): + input = [ + "The chef prepared a delicious meal.", + ] + + # test single embedding + embeddings = await client.embeddings.create( + model=model_name, + input=input, + encoding_format="float", + ) + assert embeddings.id is not None + assert embeddings.data is not None and len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 9 + assert embeddings.usage.total_tokens == 9 + + # test using token IDs + input = [1, 1, 1, 1, 1] + embeddings = await client.embeddings.create( + model=model_name, + input=input, + encoding_format="float", + ) + assert embeddings.id is not None + assert embeddings.data is not None and len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert 
embeddings.usage.prompt_tokens == 5 + assert embeddings.usage.total_tokens == 5 + + async def test_logits_bias(server, client: openai.AsyncOpenAI): prompt = "Hello, my name is" max_tokens = 5 diff --git a/tests/models/test_llama_embedding.py b/tests/models/test_llama_embedding.py new file mode 100644 index 000000000000..868b0d0c42d6 --- /dev/null +++ b/tests/models/test_llama_embedding.py @@ -0,0 +1,47 @@ +"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling. + +Run `pytest tests/models/test_llama_embedding.py`. +""" +import pytest +import torch +import torch.nn.functional as F + +MODELS = [ + "intfloat/e5-mistral-7b-instruct", +] + + +def compare_embeddings(embeddings1, embeddings2): + similarities = [ + F.cosine_similarity(torch.tensor(e1), torch.tensor(e2), dim=0) + for e1, e2 in zip(embeddings1, embeddings2) + ] + return similarities + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +# @pytest.mark.skip( +# "Two problems: 1. Failing correctness tests. 2. RuntimeError: expected " +# "scalar type BFloat16 but found Half (only in CI).") +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + hf_model = hf_runner(model, dtype=dtype) + hf_outputs = hf_model.encode(example_prompts) + del hf_model + + vllm_model = vllm_runner(model, dtype=dtype, max_model_len=32768) + vllm_outputs = vllm_model.encode(example_prompts) + del vllm_model + + similarities = compare_embeddings(hf_outputs, vllm_outputs) + all_similarities = torch.stack(similarities) + tolerance = 1e-2 + assert torch.all((all_similarities <= 1.0 + tolerance) + & (all_similarities >= 1.0 - tolerance) + ), f"Not all values are within {tolerance} of 1.0" From f002d3c72ddf6597294761109f1a1dbade8983cf Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 23 Apr 2024 22:24:52 -0700 Subject: [PATCH 09/41] Fix errors caused by rebase - Update execute_model and prepare_input_tensors in embedding_model_runner.py to follow model_runner.py = Remove unused imports --- vllm/engine/async_llm_engine.py | 4 +- vllm/engine/llm_engine.py | 23 ++- vllm/entrypoints/openai/serving_embedding.py | 4 +- vllm/model_executor/layers/pooler.py | 2 +- vllm/model_executor/models/llama.py | 11 +- vllm/model_executor/models/llama_embedding.py | 13 +- vllm/worker/embedding_model_runner.py | 165 ++++++++++++++---- vllm/worker/worker.py | 3 +- 8 files changed, 152 insertions(+), 73 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index f2c9d0689f21..c7d1c22ce9f0 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -259,7 +259,7 @@ async def add_request_async( self, request_id: str, prompt: Optional[str], - sampling_params: SamplingParams, + params: Union[SamplingParams, PoolingParams], prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, @@ -278,7 +278,7 @@ async def add_request_async( return self.add_request(request_id, prompt=prompt, - sampling_params=sampling_params, + params=params, prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, lora_request=lora_request, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fff73079c36d..95de2cb12538 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -24,8 +24,7 @@ RequestOutputFactory) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams -from vllm.sequence import 
(CompletionSequenceGroupOutput, - EmbeddingSequenceGroupOutput, ExecuteModelRequest, MultiModalData, +from vllm.sequence import (ExecuteModelRequest, EmbeddingSequenceGroupOutput, MultiModalData, PoolerOutput, SamplerOutput, Sequence, SequenceGroup, SequenceStatus, SequenceGroupMetadata) @@ -552,18 +551,16 @@ def has_unfinished_requests(self) -> bool: return self.scheduler.has_unfinished_seqs() def _process_sequence_group_outputs( - self, seq_group: SequenceGroup, - outputs: Union[CompletionSequenceGroupOutput, - EmbeddingSequenceGroupOutput] + self, + seq_group: SequenceGroup, + outputs: List[EmbeddingSequenceGroupOutput], ) -> None: + seq_group.embeddings = outputs[0].embeddings - if self.model_config.embedding_mode: - seq_group.embeddings = outputs.embeddings - - for seq in seq_group.get_seqs(): - seq.status = SequenceStatus.FINISHED_STOPPED + for seq in seq_group.get_seqs(): + seq.status = SequenceStatus.FINISHED_STOPPED - return + return def _process_model_outputs( self, @@ -591,7 +588,9 @@ def _process_model_outputs( seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - self._process_sequence_group_outputs(seq_group, outputs) + if self.model_config.embedding_mode: + self._process_sequence_group_outputs(seq_group, outputs) + continue self.output_processor.process_prompt_logprob(seq_group, outputs) if seq_group_meta.do_sample: diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index b0c69a23cedd..7a57be0c8891 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -1,4 +1,3 @@ -import asyncio import time from typing import AsyncIterator, List, Tuple @@ -13,7 +12,6 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.logger import init_logger from vllm.outputs import EmbeddingRequestOutput -from vllm.sampling_params import SamplingParams from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -104,7 +102,7 @@ async def create_embedding(self, request: EmbeddingRequest, generators.append( self.engine.generate(prompt_text, - SamplingParams(), + pooling_params, f"{request_id}-{i}", prompt_token_ids=prompt_ids)) except ValueError as e: diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index af4c177bc2a2..a2a3a9026034 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -36,7 +36,7 @@ def forward( attention_metadata: AttentionMetadata, ) -> PoolerOutput: """Pools specific information from hidden states based on metadata.""" - prompt_lens = attention_metadata.prompt_lens_tensor + prompt_lens = attention_metadata.prefill_metadata.prompt_lens_tensor if self.pooling_type == PoolingType.LAST: last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1 diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index fdcd24ba2e5f..a48b86c53588 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -298,13 +298,7 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights( - self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None, - ): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), 
@@ -314,8 +308,7 @@ def load_weights( ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): + for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue if ("rotary_emb.cos_cached" in name diff --git a/vllm/model_executor/models/llama_embedding.py b/vllm/model_executor/models/llama_embedding.py index de6bc16bf6c3..da255a8bbd75 100644 --- a/vllm/model_executor/models/llama_embedding.py +++ b/vllm/model_executor/models/llama_embedding.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Iterable, Tuple import torch from torch import nn @@ -51,12 +51,5 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, attention_metadata) - def load_weights( - self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None, - ): - self.model.load_weights(model_name_or_path, cache_dir, load_format, - revision) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + self.model.load_weights(weights) diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 05f74696d9d6..ad158e9da8f9 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -3,16 +3,16 @@ import torch from vllm.attention import AttentionMetadata -from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig, VisionLanguageConfig) +from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.distributed import broadcast_tensor_dict from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping -from vllm.model_executor.parallel_utils.communication_op import ( - broadcast_tensor_dict) +from vllm.lora.request import LoRARequest from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.pooling_params import PoolingParams from vllm.sequence import PoolerOutput, SequenceData, SequenceGroupMetadata -from vllm.worker.model_runner import ModelRunner +from vllm.worker.model_runner import BatchType, ModelRunner logger = init_logger(__name__) @@ -25,6 +25,7 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + load_config: LoadConfig, lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, @@ -34,6 +35,7 @@ def __init__( parallel_config, scheduler_config, device_config, + load_config, lora_config=lora_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, @@ -45,15 +47,17 @@ def execute_model( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[PoolerOutput]: - (input_tokens, input_positions, attn_metadata, pooling_metadata, + (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) - # Execute the model. - if attn_metadata.use_cuda_graph: + # Currently cuda graph is only supported by the decode phase. 
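The LAST branch of Pooler.forward() shown above indexes the flattened prefill hidden states; the same indexing in isolation, with made-up shapes, and an L2 normalization standing in for the pooler's normalize flag:

import torch
import torch.nn.functional as F

# Two prompts of lengths 3 and 5 packed into one (8, hidden_size) tensor.
prompt_lens = torch.tensor([3, 5])
hidden_states = torch.randn(int(prompt_lens.sum()), 4096)

# Last token of each prompt in the flattened batch: indices 2 and 7.
last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1
pooled = hidden_states[last_token_flat_indices]   # shape (2, 4096)
embeddings = F.normalize(pooled, p=2, dim=1)      # applied when normalize=True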
+ prefill_meta = attn_metadata.prefill_metadata + decode_meta = attn_metadata.decode_metadata + if prefill_meta is None and decode_meta.use_cuda_graph: graph_batch_size = input_tokens.shape[0] model_executable = self.graph_runners[graph_batch_size] else: @@ -79,27 +83,64 @@ def prepare_input_tensors( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata, - Set[int], LoRAMapping, torch.Tensor]: + Set[LoRARequest], LoRAMapping, torch.Tensor]: if self.is_driver_worker: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt + prefill_reqs = [] + decode_reqs = [] + for seq_group_meta in seq_group_metadata_list: + if seq_group_meta.is_prompt: + prefill_reqs.append(seq_group_meta) + else: + decode_reqs.append(seq_group_meta) + # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, attn_metadata, prompt_lens, - subquery_lens, lora_index_mapping, lora_prompt_mapping, - lora_requests, multi_modal_input - ) = self._prepare_prompt(seq_group_metadata_list) - else: - logger.warning( - "Embedding model should not have non-prompt inputs.") - (input_tokens, input_positions, attn_metadata, - lora_index_mapping, lora_prompt_mapping, - lora_requests) = self._prepare_decode(seq_group_metadata_list) - prompt_lens = [] - multi_modal_input = None - pooling_metadata = self._prepare_pooling(seq_group_metadata_list, - prompt_lens) + ( + input_tokens, + input_positions, + prefill_attn_metadata, + prompt_lens, + subquery_lens, + lora_index_mapping, + lora_prompt_mapping, + lora_requests, + multi_modal_input, + slot_mapping, + ) = self._prepare_prompt(prefill_reqs) + ( + decode_input_tokens, + decode_input_positions, + decode_attn_metadata, + decode_lora_index_mapping, + decode_lora_prompt_mapping, + decode_lora_requests, + decode_slot_mapping, + ) = self._prepare_decode(decode_reqs) + + if not self.scheduler_config.chunked_prefill_enabled: + assert (len(prefill_reqs) and len(decode_reqs)) == 0 + + num_prefills = len(prompt_lens) + num_prefill_tokens = len(input_tokens) + num_decode_tokens = len(decode_input_tokens) + + # Coalesce tensors. Note that attn_metadata is currently not + # coalesced for simplicity. + input_tokens.extend(decode_input_tokens) + input_positions.extend(decode_input_positions) + slot_mapping.extend(decode_slot_mapping) + lora_index_mapping.extend(decode_lora_index_mapping) + lora_prompt_mapping.extend(decode_lora_prompt_mapping) + lora_requests.update(decode_lora_requests) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) if self.lora_config: lora_mapping = LoRAMapping( @@ -110,31 +151,85 @@ def prepare_input_tensors( lora_mapping = None # Broadcast the metadata. + # If batch contains both prefill and decode, it sends 2 broadcasts. + # If it only contains 1 type, it triggers a single broadcast. 
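The driver/worker handshake used throughout prepare_input_tensors reduces to one collective per broadcast; a simplified sketch of the pattern (the sync_inputs wrapper is ours and omits the attention and LoRA fields the real code carries):

from typing import Dict, Optional

from vllm.distributed import broadcast_tensor_dict

def sync_inputs(is_driver_worker: bool,
                metadata_dict: Optional[Dict] = None) -> Dict:
    if is_driver_worker:
        # Rank 0 assembles the batch and sends it to every other rank.
        broadcast_tensor_dict(metadata_dict, src=0)
        return metadata_dict
    # Non-driver ranks block until they receive the same dict from rank 0.
    return broadcast_tensor_dict(src=0)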
+ if (prefill_attn_metadata is not None + and decode_attn_metadata is not None): + batch_type = BatchType.MIXED + elif prefill_attn_metadata is not None: + batch_type = BatchType.PREFILL + else: + batch_type = BatchType.DECODE + metadata_dict = { "input_tokens": input_tokens, "input_positions": input_positions, "lora_requests": lora_requests, "lora_mapping": lora_mapping, "multi_modal_input": multi_modal_input, - "prompt_lens": pooling_metadata.prompt_lens, + "num_prefill_tokens": num_prefill_tokens, + "num_decode_tokens": num_decode_tokens, + "slot_mapping": slot_mapping, + "num_prefills": num_prefills, + "batch_type": batch_type, } - metadata_dict.update(attn_metadata.asdict_zerocopy()) + if prefill_attn_metadata is not None: + metadata_dict.update(prefill_attn_metadata.asdict_zerocopy()) + else: + assert decode_attn_metadata is not None + metadata_dict.update(decode_attn_metadata.asdict_zerocopy()) broadcast_tensor_dict(metadata_dict, src=0) + + # Broadcast decode attn metadata for mixed batch type. + # The additional broadcast costs 300us overhead on 4 A10 GPUs. + # We can potentially reduce the overhead by coelescing tensors. + if batch_type == BatchType.MIXED: + assert decode_attn_metadata is not None + metadata_dict = decode_attn_metadata.asdict_zerocopy() + broadcast_tensor_dict(metadata_dict, src=0) else: metadata_dict = broadcast_tensor_dict(src=0) - print(metadata_dict) input_tokens = metadata_dict.pop("input_tokens") input_positions = metadata_dict.pop("input_positions") + slot_mapping = metadata_dict.pop("slot_mapping") + num_prefills = metadata_dict.pop("num_prefills") lora_mapping = metadata_dict.pop("lora_mapping") lora_requests = metadata_dict.pop("lora_requests") multi_modal_input = metadata_dict.pop("multi_modal_input") - attn_metadata = self.attn_backend.make_metadata(**metadata_dict) + num_prefill_tokens = metadata_dict.pop("num_prefill_tokens") + num_decode_tokens = metadata_dict.pop("num_decode_tokens") + batch_type = metadata_dict.pop("batch_type") - prompt_lens = metadata_dict.pop("prompt_lens") + # Create an attention metadata. + prefill_attn_metadata = None + decode_attn_metadata = None + if batch_type == BatchType.PREFILL or batch_type == BatchType.MIXED: + prefill_attn_metadata = self.attn_backend.make_metadata( + **metadata_dict) + else: + decode_attn_metadata = self.attn_backend.make_metadata( + **metadata_dict) + + # if it is a mixed batch, decode attn_metadata is broadcasted + # separately. + if batch_type == BatchType.MIXED: + metadata_dict = broadcast_tensor_dict(src=0) + decode_attn_metadata = self.attn_backend.make_metadata( + **metadata_dict) + + attn_metadata = AttentionMetadata( + num_prefills=num_prefills, + slot_mapping=slot_mapping, + num_prefill_tokens=num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + prefill_metadata=prefill_attn_metadata, + decode_metadata=decode_attn_metadata, + kv_cache_dtype=self.kv_cache_dtype, + ) - pooling_metadata = PoolingMetadata(seq_groups=None, - seq_data=None, - prompt_lens=None) + # Prepare PoolingMetadata + pooling_metadata = self._prepare_pooling( + seq_group_metadata_list, decode_attn_metadata.prompt_lens) return (input_tokens, input_positions, attn_metadata, pooling_metadata, lora_requests, lora_mapping, multi_modal_input) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c3fc1ef6bd4c..9d815ebaca3c 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -87,7 +87,8 @@ def __init__( # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: CacheEngine - self.gpu_cache: List[torch.Tensor] + # Initialize gpu_cache as embedding models don't initialize kv_caches + self.gpu_cache: List[torch.Tensor] = None def init_device(self) -> None: if self.device_config.device.type == "cuda": From 97a493d405d5b490c12e7f43cb3b36518b745498 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Wed, 24 Apr 2024 17:39:48 -0400 Subject: [PATCH 10/41] Update vllm/engine/async_llm_engine.py Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- vllm/engine/async_llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index c7d1c22ce9f0..63a6af84d90a 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -671,7 +671,7 @@ async def encode( lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None ) -> AsyncIterator[EmbeddingRequestOutput]: - """Generate outputs for a request. + """Generate outputs for a request from an embedding model. Generate outputs for a request. This method is a coroutine. It adds the request into the waiting queue of the LLMEngine and streams the outputs From 182ff0964fb8e2089e7f6b7d187e16e96b5db835 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Wed, 24 Apr 2024 16:45:27 -0700 Subject: [PATCH 11/41] Apply suggestions from code review Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- vllm/engine/async_llm_engine.py | 8 ++++---- vllm/worker/embedding_model_runner.py | 2 +- vllm/worker/model_runner.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 63a6af84d90a..f74491e860a4 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -605,8 +605,8 @@ async def generate( multi_modal_data: Multi modal data per request. Yields: - The output `RequestOutput` or `EmbeddingRequestOutput` objects - from the LLMEngine for the request. + The output `CompletionRequestOutput` objects from the LLMEngine + for the request. Details: - If the engine is not running, start the background loop, @@ -688,8 +688,8 @@ async def encode( multi_modal_data: Multi modal data per request. Yields: - The output `RequestOutput` or `EmbeddingRequestOutput` objects - from the LLMEngine for the request. + The output `EmbeddingRequestOutput` objects from the LLMEngine + for the request. 
Details: - If the engine is not running, start the background loop, diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index ad158e9da8f9..a77c801746f2 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -47,7 +47,7 @@ def execute_model( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[PoolerOutput]: - (input_tokens, input_positions, attn_metadata, sampling_metadata, + (input_tokens, input_positions, attn_metadata, pooling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e3e5c3d09a5d..02884c8e5c53 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1175,6 +1175,6 @@ def _is_block_tables_empty(block_tables: Union[None, Dict]): if block_tables is None: return True if isinstance(block_tables, dict) and all( - not value for value in block_tables.values()): + value is None for value in block_tables.values()): return True return False From 29f888ee2c4bb9460b475b79d418900390e363d0 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Wed, 24 Apr 2024 16:47:29 -0700 Subject: [PATCH 12/41] Resolve comments - Fix tests - Add PoolingTensor in pooling_metadata.py - Use PoolingMetadata instead of AttentionMetadata in pooler - Revert LLM.generate() to return RequestOutput - Add batch tests for embedding in test_openai_server.py - Fix CI and remove --embedding-mode in arg_utils --- requirements-dev.txt | 1 + .../output_processor/test_multi_step.py | 12 ++-- tests/entrypoints/test_openai_server.py | 55 ++++++++++++++++--- tests/models/test_llama_embedding.py | 5 +- tests/samplers/test_logits_processor.py | 6 +- tests/samplers/test_seeded_generate.py | 2 +- vllm/config.py | 11 ++-- vllm/core/embedding_model_block_manager.py | 10 ++-- vllm/engine/async_llm_engine.py | 7 +-- vllm/engine/llm_engine.py | 4 +- vllm/entrypoints/llm.py | 7 ++- vllm/model_executor/layers/pooler.py | 8 ++- vllm/model_executor/models/__init__.py | 4 ++ vllm/model_executor/models/llama_embedding.py | 7 ++- vllm/model_executor/pooling_metadata.py | 37 +++++++++++++ vllm/worker/embedding_model_runner.py | 2 +- vllm/worker/worker.py | 2 +- 17 files changed, 130 insertions(+), 50 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index e6d375cbafa3..f943ff9645e3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -25,6 +25,7 @@ requests ray peft awscli +sentence-transformers # required for embedding test # Benchmarking aiohttp diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py index 6da3da091db7..2bf4bf69da20 100644 --- a/tests/engine/output_processor/test_multi_step.py +++ b/tests/engine/output_processor/test_multi_step.py @@ -9,8 +9,8 @@ from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor from vllm.engine.output_processor.stop_checker import StopChecker from vllm.sampling_params import SamplingParams -from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput, - SequenceStatus) +from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, + SequenceOutput, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.utils import Counter @@ -51,7 +51,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): new_token_ids = list(range(num_new_tokens)) outputs = 
[ - SequenceGroupOutput( + CompletionSequenceGroupOutput( samples=[ SequenceOutput( parent_seq_id=seq.seq_id, @@ -103,7 +103,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, new_token_ids = list(range(num_new_tokens)) outputs = [ - SequenceGroupOutput( + CompletionSequenceGroupOutput( samples=[ SequenceOutput( parent_seq_id=seq.seq_id, @@ -170,7 +170,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, new_token_ids[eos_index] = eos_token_id outputs = [ - SequenceGroupOutput( + CompletionSequenceGroupOutput( samples=[ SequenceOutput( parent_seq_id=seq.seq_id, @@ -239,7 +239,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, new_token_ids[eos_index] = eos_token_id outputs = [ - SequenceGroupOutput( + CompletionSequenceGroupOutput( samples=[ SequenceOutput( parent_seq_id=seq.seq_id, diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 7d3b6de329c1..79f194c58ee2 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -123,8 +123,15 @@ def zephyr_lora_files(): @pytest.fixture(scope="session") -def embedding_server(zephyr_lora_files): +def ray_context(): + # Initialize Ray once for the entire session ray.init() + yield + ray.shutdown() + + +@pytest.fixture(scope="session") +def embedding_server(ray_context, zephyr_lora_files): server_runner = ServerRunner.remote([ "--model", EMBEDDING_MODEL_NAME, @@ -137,12 +144,10 @@ def embedding_server(zephyr_lora_files): ]) ray.get(server_runner.ready.remote()) yield server_runner - ray.shutdown() @pytest.fixture(scope="session") -def server(zephyr_lora_files): - ray.init() +def server(ray_context, zephyr_lora_files): server_runner = ServerRunner.remote([ "--model", MODEL_NAME, @@ -166,7 +171,6 @@ def server(zephyr_lora_files): ]) ray.get(server_runner.ready.remote()) yield server_runner - ray.shutdown() @pytest.fixture(scope="module") @@ -481,12 +485,11 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, @pytest.mark.parametrize( - # just test 1 lora hereafter "model_name", [EMBEDDING_MODEL_NAME], ) -async def test_embedding(embedding_server, client: openai.AsyncOpenAI, - model_name: str): +async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, + model_name: str): input = [ "The chef prepared a delicious meal.", ] @@ -519,6 +522,42 @@ async def test_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 5 +@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, + model_name: str): + # test List[str] + inputs = [ + "The cat sat on the mat.", "A feline was resting on a rug.", + "Stars twinkle brightly in the night sky." 
+ ] + embeddings = await client.embeddings.create( + model=model_name, + input=inputs, + encoding_format="float", + ) + assert embeddings.id is not None + assert embeddings.data is not None and len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) == 4096 + + # test List[List[int]] + inputs = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], + [25, 32, 64, 77]] + embeddings = await client.embeddings.create( + model=model_name, + input=inputs, + encoding_format="float", + ) + assert embeddings.id is not None + assert embeddings.data is not None and len(embeddings.data) == 4 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 17 + assert embeddings.usage.total_tokens == 17 + + async def test_logits_bias(server, client: openai.AsyncOpenAI): prompt = "Hello, my name is" max_tokens = 5 diff --git a/tests/models/test_llama_embedding.py b/tests/models/test_llama_embedding.py index 868b0d0c42d6..ffbea984e414 100644 --- a/tests/models/test_llama_embedding.py +++ b/tests/models/test_llama_embedding.py @@ -20,10 +20,7 @@ def compare_embeddings(embeddings1, embeddings2): @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -# @pytest.mark.skip( -# "Two problems: 1. Failing correctness tests. 2. RuntimeError: expected " -# "scalar type BFloat16 but found Half (only in CI).") +@pytest.mark.parametrize("dtype", ["half"]) def test_models( hf_runner, vllm_runner, diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 3788e9e9752f..be4c2ea1b781 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -36,14 +36,14 @@ def pick_vllm(token_ids, logits): # test logits_processors when prompt_logprobs is not None vllm_model.model._add_request( prompt=example_prompts[0], - sampling_params=params_with_logprobs, + params=params_with_logprobs, prompt_token_ids=None, ) # test prompt_logprobs is not None vllm_model.model._add_request( prompt=example_prompts[1], - sampling_params=SamplingParams( + params=SamplingParams( prompt_logprobs=3, max_tokens=max_tokens, ), @@ -53,7 +53,7 @@ def pick_vllm(token_ids, logits): # test grouped requests vllm_model.model._add_request( prompt=example_prompts[2], - sampling_params=SamplingParams(max_tokens=max_tokens), + params=SamplingParams(max_tokens=max_tokens), prompt_token_ids=None, ) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 3cd659cef58d..ce4501bbf71e 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -60,7 +60,7 @@ def test_random_sample_with_seed( llm._add_request( prompt=prompt, prompt_token_ids=None, - sampling_params=params, + params=params, ) results = llm._run_engine(use_tqdm=False) diff --git a/vllm/config.py b/vllm/config.py index 8248a565c8dd..184e6aac4501 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -9,6 +9,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, get_quantization_config) +from vllm.model_executor.models import ModelRegistry from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron @@ -22,6 +23,7 @@ logger = init_logger(__name__) _GB = 1 << 30 +_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 class ModelConfig: @@ -214,8 +216,8 @@ def _verify_cuda_graph(self) -> None: 
def _check_embedding_mode(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) - pattern = r".*Model$" - return any(re.match(pattern, arch) for arch in architectures) + return any( + ModelRegistry.is_embedding_model(arch) for arch in architectures) def verify_with_parallel_config( self, @@ -619,8 +621,9 @@ def __init__( # and TTFT on A100. Note it is not optimized for throughput. self.max_num_batched_tokens = 512 elif embedding_mode: - # For embedding, choose 32768 for higher throughput - self.max_num_batched_tokens = max(max_model_len, 32768) + # For embedding, choose specific value for higher throughput + self.max_num_batched_tokens = max( + max_model_len, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS) else: # If max_model_len is too short, use 2048 as the default value # for higher throughput. diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/embedding_model_block_manager.py index f17c0ac03ae6..413e231bcdf0 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/embedding_model_block_manager.py @@ -37,7 +37,7 @@ def append_slots( seq: Sequence, num_lookahead_slots: int, ) -> Dict[int, List[int]]: - return {} + return None def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: pass @@ -48,20 +48,20 @@ def can_swap_in(self, seq_group: SequenceGroup, def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: - return {} + return None def can_swap_out(self, seq_group: SequenceGroup) -> bool: return True def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: - return {} + return None def free(self, seq: Sequence) -> None: # No operation on free return def get_block_table(self, seq: Sequence) -> List[int]: - return [] + return None def get_num_free_gpu_blocks(self) -> int: return 1 @@ -78,7 +78,7 @@ def access_all_blocks_in_seq( def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: - return [] + return None def mark_blocks_as_computed(self, seq_group: SequenceGroup): pass diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index f74491e860a4..d90311916756 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -656,7 +656,6 @@ async def generate( prompt, sampling_params, prompt_token_ids, - None, # No arrival time lora_request, multi_modal_data, ): @@ -737,7 +736,6 @@ async def encode( prompt, pooling_params, prompt_token_ids, - None, lora_request, multi_modal_data, ): @@ -749,15 +747,12 @@ async def process_request( prompt: Optional[str], params: Union[SamplingParams, PoolingParams], prompt_token_ids: Optional[List[int]] = None, - arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, ) -> AsyncIterator[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: """Common logic to process requests with SamplingParams or PoolingParams.""" - # Preprocess the request and set default arrival time if not provided - if arrival_time is None: - arrival_time = time.time() + arrival_time = time.time() stream = await self.add_request( request_id, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 95de2cb12538..7d31bba5e5d5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -26,8 +26,8 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (ExecuteModelRequest, EmbeddingSequenceGroupOutput, MultiModalData, PoolerOutput, SamplerOutput, Sequence, - SequenceGroup, SequenceStatus, - 
SequenceGroupMetadata) + SequenceGroup, SequenceGroupMetadata, + SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index a7bd3c79c27f..fbe099631045 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -8,7 +8,8 @@ from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import CompletionRequestOutput, EmbeddingRequestOutput +from vllm.outputs import (CompletionRequestOutput, EmbeddingRequestOutput, + RequestOutput) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.sequence import MultiModalData @@ -147,7 +148,7 @@ def generate( use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, - ) -> List[CompletionRequestOutput]: + ) -> List[RequestOutput]: """Generates the completions for the input prompts. NOTE: This class automatically batches the given prompts, considering @@ -168,7 +169,7 @@ def generate( multi_modal_data: Multi modal data. Returns: - A list of `CompletionRequestOutput` objects containing the + A list of `RequestOutput` objects containing the generated completions in the same order as the input prompts. """ if sampling_params is None: diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index a2a3a9026034..445b30b8c6e9 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -3,7 +3,8 @@ import torch import torch.nn as nn -from vllm.attention import AttentionMetadata +from vllm.model_executor.pooling_metadata import (PoolingMetadata, + PoolingTensors) from vllm.sequence import EmbeddingSequenceGroupOutput, PoolerOutput @@ -33,10 +34,11 @@ def __init__(self, pooling_type: PoolingType, normalize: bool): def forward( self, hidden_states: torch.Tensor, - attention_metadata: AttentionMetadata, + pooling_metadata: PoolingMetadata, ) -> PoolerOutput: """Pools specific information from hidden states based on metadata.""" - prompt_lens = attention_metadata.prefill_metadata.prompt_lens_tensor + prompt_lens = PoolingTensors.from_pooling_metadata( + pooling_metadata, hidden_states.device).prompt_lens if self.pooling_type == PoolingType.LAST: last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1 diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 82344c4d8e7c..d90bb8100169 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -119,6 +119,10 @@ def register_model(model_arch: str, model_cls: Type[nn.Module]): global _OOT_MODELS _OOT_MODELS[model_arch] = model_cls + @staticmethod + def is_embedding_model(model_arch: str) -> bool: + return model_arch in _EMBEDDING_MODELS + __all__ = [ "ModelRegistry", diff --git a/vllm/model_executor/models/llama_embedding.py b/vllm/model_executor/models/llama_embedding.py index da255a8bbd75..18570eae7fd3 100644 --- a/vllm/model_executor/models/llama_embedding.py +++ b/vllm/model_executor/models/llama_embedding.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Iterable, Tuple +from typing import Iterable, List, Optional, Tuple import torch from torch import nn @@ -9,6 +9,7 @@ from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.pooler import 
Pooler, PoolingType from vllm.model_executor.models.llama import LlamaModel +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import PoolerOutput @@ -47,9 +48,9 @@ def forward( def pooler( self, hidden_states: torch.Tensor, - attention_metadata: AttentionMetadata, + pooling_metadata: PoolingMetadata, ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, attention_metadata) + return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): self.model.load_weights(weights) diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index e34308bf40ba..28eeec0f618f 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -1,6 +1,10 @@ +from dataclasses import dataclass from typing import Any, Dict, List, Tuple +import torch + from vllm.pooling_params import PoolingParams +from vllm.utils import is_pin_memory_available class PoolingMetadata: @@ -30,3 +34,36 @@ def __repr__(self) -> str: f"seq_groups={self.seq_groups}, " f"seq_data={self.seq_data}, " f"prompt_lens={self.prompt_lens}, ") + + +@dataclass +class PoolingTensors: + """Tensors for pooling.""" + + prompt_lens: torch.Tensor + + @classmethod + def from_pooling_metadata( + cls, + pooling_metadata: "PoolingMetadata", + device: torch.device, + ) -> "PoolingTensors": + """ + Create PoolingTensors from PoolingMetadata. + + Args: + pooling_metadata: PoolingMetadata instance to convert. + device: Device to store the tensors. + """ + # Convert prompt lengths to tensor + pin_memory = is_pin_memory_available() + + prompt_lens_t = torch.tensor( + pooling_metadata.prompt_lens, + device="cpu", + dtype=torch.long, + pin_memory=pin_memory, + ) + + return cls(prompt_lens=prompt_lens_t.to(device=device, + non_blocking=True), ) diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index a77c801746f2..ce62ba87fde5 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -77,7 +77,7 @@ def execute_model( hidden_states = model_executable(**execute_model_kwargs) return self.model.pooler(hidden_states=hidden_states, - attention_metadata=attn_metadata) + pooling_metadata=pooling_metadata) def prepare_input_tensors( self, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9d815ebaca3c..d9c78ee0cdfd 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -88,7 +88,7 @@ def __init__( # initialize_cache. self.cache_engine: CacheEngine # Initialize gpu_cache as embedding models don't initialize kv_caches - self.gpu_cache: List[torch.Tensor] = None + self.gpu_cache: Optional[List[torch.tensor]] = None def init_device(self) -> None: if self.device_config.device.type == "cuda": From a7dc48472f86f7e33fef9a404f7c39d7c2ea9de8 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Thu, 25 Apr 2024 00:06:05 -0700 Subject: [PATCH 13/41] Fix EntryPointsTest, ModelsTest and rebase - Fix rebase errors - Format files - Suppress mypy type error in embedding_block_manager - Add spec_decode_worker_metrics in PoolerOutput since it is required in LLMEngine._get_stats Fix EntryPoints Test - It keeps running after introducing embedding_server in test_openai_server.py. Try to set fixture scope to `module` to reduce ray resource. 
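For reference, the Ray lifecycle pattern behind the fixture-scope fix is sketched below. This is only an illustrative sketch (the fixture name `ray_module_context` is hypothetical); in the test file itself the same init/shutdown steps are inlined into the module-scoped `server` and `embedding_server` fixtures shown in the diff that follows.

    import pytest
    import ray

    @pytest.fixture(scope="module")
    def ray_module_context():
        # Illustrative helper: each test module gets its own Ray lifecycle
        # instead of one session-wide ray.init(), so a finished module
        # releases its server resources before the next module starts.
        ray.shutdown()  # no-op if Ray is not running; clears leftover state
        ray.init()
        yield
        ray.shutdown()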
--- tests/entrypoints/openai/test_serving_chat.py | 1 + tests/entrypoints/test_openai_server.py | 193 +++++++++--------- tests/models/test_llama_embedding.py | 2 +- vllm/core/embedding_model_block_manager.py | 14 +- vllm/engine/llm_engine.py | 12 +- vllm/executor/gpu_executor.py | 4 +- vllm/model_executor/models/llama_embedding.py | 9 +- vllm/outputs.py | 4 + vllm/sequence.py | 2 + vllm/spec_decode/spec_decode_worker.py | 5 +- vllm/worker/embedding_model_runner.py | 16 +- vllm/worker/model_runner.py | 7 +- 12 files changed, 136 insertions(+), 133 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 13e2e372cef3..74b49726734b 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -14,6 +14,7 @@ class MockModelConfig: tokenizer_mode = "auto" max_model_len = 100 tokenizer_revision = None + embedding_mode = False @dataclass diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 79f194c58ee2..c22ac4507658 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -122,55 +122,52 @@ def zephyr_lora_files(): return snapshot_download(repo_id=LORA_NAME) -@pytest.fixture(scope="session") -def ray_context(): - # Initialize Ray once for the entire session +@pytest.fixture(scope="module") +def server(zephyr_lora_files): ray.init() - yield - ray.shutdown() - - -@pytest.fixture(scope="session") -def embedding_server(ray_context, zephyr_lora_files): server_runner = ServerRunner.remote([ "--model", - EMBEDDING_MODEL_NAME, + MODEL_NAME, # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", "--max-model-len", "8192", "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", ]) ray.get(server_runner.ready.remote()) yield server_runner + ray.shutdown() -@pytest.fixture(scope="session") -def server(ray_context, zephyr_lora_files): +@pytest.fixture(scope="module") +def embedding_server(zephyr_lora_files): + ray.shutdown() + ray.init() server_runner = ServerRunner.remote([ "--model", - MODEL_NAME, + EMBEDDING_MODEL_NAME, # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", "--max-model-len", "8192", "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", ]) ray.get(server_runner.ready.remote()) yield server_runner + ray.shutdown() @pytest.fixture(scope="module") @@ -484,80 +481,6 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, assert texts[0] == texts[1] -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, - model_name: str): - input = [ - "The chef prepared a delicious meal.", - ] - - # test single embedding - embeddings = await client.embeddings.create( - model=model_name, - input=input, - encoding_format="float", - ) - assert embeddings.id is not None - assert embeddings.data is not None and len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert 
embeddings.usage.prompt_tokens == 9 - assert embeddings.usage.total_tokens == 9 - - # test using token IDs - input = [1, 1, 1, 1, 1] - embeddings = await client.embeddings.create( - model=model_name, - input=input, - encoding_format="float", - ) - assert embeddings.id is not None - assert embeddings.data is not None and len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 5 - assert embeddings.usage.total_tokens == 5 - - -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, - model_name: str): - # test List[str] - inputs = [ - "The cat sat on the mat.", "A feline was resting on a rug.", - "Stars twinkle brightly in the night sky." - ] - embeddings = await client.embeddings.create( - model=model_name, - input=inputs, - encoding_format="float", - ) - assert embeddings.id is not None - assert embeddings.data is not None and len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) == 4096 - - # test List[List[int]] - inputs = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], - [25, 32, 64, 77]] - embeddings = await client.embeddings.create( - model=model_name, - input=inputs, - encoding_format="float", - ) - assert embeddings.id is not None - assert embeddings.data is not None and len(embeddings.data) == 4 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 17 - assert embeddings.usage.total_tokens == 17 - - async def test_logits_bias(server, client: openai.AsyncOpenAI): prompt = "Hello, my name is" max_tokens = 5 @@ -987,5 +910,79 @@ async def test_long_seed(server, client: openai.AsyncOpenAI): or "less_than_equal" in exc_info.value.message) +@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, + model_name: str): + input = [ + "The chef prepared a delicious meal.", + ] + + # test single embedding + embeddings = await client.embeddings.create( + model=model_name, + input=input, + encoding_format="float", + ) + assert embeddings.id is not None + assert embeddings.data is not None and len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 9 + assert embeddings.usage.total_tokens == 9 + + # test using token IDs + input = [1, 1, 1, 1, 1] + embeddings = await client.embeddings.create( + model=model_name, + input=input, + encoding_format="float", + ) + assert embeddings.id is not None + assert embeddings.data is not None and len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 5 + assert embeddings.usage.total_tokens == 5 + + +@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, + model_name: str): + # test List[str] + inputs = [ + "The cat sat on the mat.", "A feline was resting on a rug.", + "Stars twinkle brightly in the night sky." 
+ ] + embeddings = await client.embeddings.create( + model=model_name, + input=inputs, + encoding_format="float", + ) + assert embeddings.id is not None + assert embeddings.data is not None and len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) == 4096 + + # test List[List[int]] + inputs = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], + [25, 32, 64, 77]] + embeddings = await client.embeddings.create( + model=model_name, + input=inputs, + encoding_format="float", + ) + assert embeddings.id is not None + assert embeddings.data is not None and len(embeddings.data) == 4 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 17 + assert embeddings.usage.total_tokens == 17 + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/models/test_llama_embedding.py b/tests/models/test_llama_embedding.py index ffbea984e414..59bf054913f7 100644 --- a/tests/models/test_llama_embedding.py +++ b/tests/models/test_llama_embedding.py @@ -32,7 +32,7 @@ def test_models( hf_outputs = hf_model.encode(example_prompts) del hf_model - vllm_model = vllm_runner(model, dtype=dtype, max_model_len=32768) + vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.encode(example_prompts) del vllm_model diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/embedding_model_block_manager.py index 413e231bcdf0..aed7bb9307ff 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/embedding_model_block_manager.py @@ -37,31 +37,31 @@ def append_slots( seq: Sequence, num_lookahead_slots: int, ) -> Dict[int, List[int]]: - return None + return None # type: ignore def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: pass def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - return True + num_lookahead_slots: int) -> AllocStatus: + return AllocStatus.OK def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: - return None + return None # type: ignore def can_swap_out(self, seq_group: SequenceGroup) -> bool: return True def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: - return None + return None # type: ignore def free(self, seq: Sequence) -> None: # No operation on free return def get_block_table(self, seq: Sequence) -> List[int]: - return None + return None # type: ignore def get_num_free_gpu_blocks(self) -> int: return 1 @@ -78,7 +78,7 @@ def access_all_blocks_in_seq( def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: - return None + return None # type: ignore def mark_blocks_as_computed(self, seq_group: SequenceGroup): pass diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7d31bba5e5d5..3101b81e19f4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -24,9 +24,9 @@ RequestOutputFactory) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams -from vllm.sequence import (ExecuteModelRequest, EmbeddingSequenceGroupOutput, MultiModalData, - PoolerOutput, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupMetadata, +from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest, + MultiModalData, PoolerOutput, SamplerOutput, + Sequence, SequenceGroup, SequenceGroupMetadata, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, @@ -803,8 +803,10 @@ 
def _get_stats( seq.get_output_len() for seq in seq_group.get_finished_seqs() ]) - best_of_requests.append(seq_group.sampling_params.best_of) - n_requests.append(seq_group.sampling_params.n) + if seq_group.sampling_params is not None: + best_of_requests.append( + seq_group.sampling_params.best_of) + n_requests.append(seq_group.sampling_params.n) finished_reason_requests.extend([ SequenceStatus.get_finished_reason(seq.status) for seq in seq_group.get_finished_seqs() diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 47abdfca4556..2b72b31b5f07 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -123,8 +123,8 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[Union[SamplerOutput, PoolerOutput]]: + self, execute_model_req: ExecuteModelRequest + ) -> List[Union[SamplerOutput, PoolerOutput]]: output = self.driver_worker.execute_model(execute_model_req) return output diff --git a/vllm/model_executor/models/llama_embedding.py b/vllm/model_executor/models/llama_embedding.py index 18570eae7fd3..e8b61636b5dd 100644 --- a/vllm/model_executor/models/llama_embedding.py +++ b/vllm/model_executor/models/llama_embedding.py @@ -2,11 +2,8 @@ import torch from torch import nn -from transformers import LlamaConfig from vllm.attention import AttentionMetadata -from vllm.config import LoRAConfig -from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.pooling_metadata import PoolingMetadata @@ -26,12 +23,10 @@ class LlamaEmbeddingModel(nn.Module): def __init__( self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, + **kwargs, ) -> None: super().__init__() - self.model = LlamaModel(config, linear_method, lora_config) + self.model = LlamaModel(**kwargs) self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) def forward( diff --git a/vllm/outputs.py b/vllm/outputs.py index c448c7eac773..54f11d791bd6 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -137,6 +137,8 @@ def __init__( @classmethod def from_seq_group(cls, seq_group: SequenceGroup) -> "CompletionRequestOutput": + if seq_group.sampling_params is None: + raise ValueError("Sampling parameters are missing in seq_group.") seqs = seq_group.get_seqs() if len(seqs) == 1: top_n_seqs = seqs @@ -213,6 +215,8 @@ def __init__(self, request_id: str, outputs: 'EmbeddingOutput', @classmethod def from_seq_group(cls, seq_group: 'SequenceGroup') -> "EmbeddingRequestOutput": + if seq_group.embeddings is None: + raise ValueError("Embeddings are missing in seq_group.") output = EmbeddingOutput(seq_group.embeddings) prompt_token_ids = seq_group.prompt_token_ids finished = seq_group.is_finished() diff --git a/vllm/sequence.py b/vllm/sequence.py index 4529cc5cec01..66a6963f4054 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -819,6 +819,8 @@ class PoolerOutput: """The output from a pooling operation in the Llama model.""" outputs: List[EmbeddingSequenceGroupOutput] + spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + def __getitem__(self, idx: int): return self.outputs[idx] diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 
d232393f0fa5..a4e759095b29 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -5,9 +5,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, - Logprob, SamplerOutput, SequenceGroupMetadata, - SequenceOutput) +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, + SequenceGroupMetadata) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index ce62ba87fde5..70123b674a9f 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -44,7 +44,7 @@ def __init__( @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + seq_group_metadata_list: List[SequenceGroupMetadata], kv_caches: List[torch.Tensor], ) -> Optional[PoolerOutput]: (input_tokens, input_positions, attn_metadata, pooling_metadata, @@ -81,7 +81,7 @@ def execute_model( def prepare_input_tensors( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + seq_group_metadata_list: List[SequenceGroupMetadata], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata, Set[LoRARequest], LoRAMapping, torch.Tensor]: if self.is_driver_worker: @@ -116,6 +116,10 @@ def prepare_input_tensors( decode_slot_mapping, ) = self._prepare_decode(decode_reqs) + # Prepare PoolingMetadata + pooling_metadata = self._prepare_pooling(seq_group_metadata_list, + prompt_lens) + if not self.scheduler_config.chunked_prefill_enabled: assert (len(prefill_reqs) and len(decode_reqs)) == 0 @@ -210,6 +214,10 @@ def prepare_input_tensors( decode_attn_metadata = self.attn_backend.make_metadata( **metadata_dict) + pooling_metadata = PoolingMetadata(seq_groups=None, + seq_data=None, + prompt_lens=None) + # if it is a mixed batch, decode attn_metadata is broadcasted # separately. 
if batch_type == BatchType.MIXED: @@ -227,10 +235,6 @@ def prepare_input_tensors( kv_cache_dtype=self.kv_cache_dtype, ) - # Prepare PoolingMetadata - pooling_metadata = self._prepare_pooling( - seq_group_metadata_list, decode_attn_metadata.prompt_lens) - return (input_tokens, input_positions, attn_metadata, pooling_metadata, lora_requests, lora_mapping, multi_modal_input) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 02884c8e5c53..f9ab1192829e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -301,10 +301,9 @@ def _prepare_prompt( lora_requests.add(seq_group_metadata.lora_request) lora_index_mapping += [lora_id] * (seq_len - context_len) - lora_prompt_mapping.extend( - [lora_id] * - (seq_len - context_len - if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + lora_prompt_mapping.extend([lora_id] * ( + seq_len - context_len if seq_group_metadata.sampling_params + and seq_group_metadata.sampling_params.prompt_logprobs else 1)) if seq_group_metadata.multi_modal_data: multi_modal_input_list.append( From a744fd17d45b130d21927b083c10c9d75977f7e9 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 7 May 2024 11:39:30 -0700 Subject: [PATCH 14/41] Revert `CompletionRequestOutput` to `RequestOutput` --- docs/source/getting_started/quickstart.rst | 2 +- docs/source/quantization/auto_awq.rst | 2 +- docs/source/quantization/fp8_e5m2_kvcache.rst | 2 +- examples/llm_engine_example.py | 4 +- examples/multilora_inference.py | 4 +- examples/offline_inference.py | 2 +- examples/offline_inference_distributed.py | 2 +- examples/offline_inference_neuron.py | 2 +- examples/offline_inference_with_prefix.py | 2 +- tests/async_engine/test_request_tracker.py | 4 +- vllm/__init__.py | 4 +- vllm/engine/async_llm_engine.py | 22 ++++----- vllm/engine/llm_engine.py | 10 ++--- vllm/entrypoints/llm.py | 10 ++--- vllm/entrypoints/openai/serving_chat.py | 12 ++--- vllm/entrypoints/openai/serving_completion.py | 13 +++--- vllm/outputs.py | 45 +++++-------------- 17 files changed, 55 insertions(+), 87 deletions(-) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 03758d630f4a..7c44a96865a5 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -48,7 +48,7 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O llm = LLM(model="facebook/opt-125m") -Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``CompletionRequestOutput`` objects, which include all the output tokens. +Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. .. code-block:: python diff --git a/docs/source/quantization/auto_awq.rst b/docs/source/quantization/auto_awq.rst index e060dd29af0f..bbbb9aee78b3 100644 --- a/docs/source/quantization/auto_awq.rst +++ b/docs/source/quantization/auto_awq.rst @@ -65,7 +65,7 @@ AWQ models are also supported directly through the LLM entrypoint: # Create an LLM. llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") - # Generate texts from the prompts. 
The output is a list of CompletionRequestOutput objects + # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. diff --git a/docs/source/quantization/fp8_e5m2_kvcache.rst b/docs/source/quantization/fp8_e5m2_kvcache.rst index 749caf623d31..337252a00aef 100644 --- a/docs/source/quantization/fp8_e5m2_kvcache.rst +++ b/docs/source/quantization/fp8_e5m2_kvcache.rst @@ -22,7 +22,7 @@ Here is an example of how to enable this feature: sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") - # Generate texts from the prompts. The output is a list of CompletionRequestOutput objects + # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index 790c63ac2c3b..a81c4b3e399c 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -1,7 +1,7 @@ import argparse from typing import List, Tuple -from vllm import CompletionRequestOutput, EngineArgs, LLMEngine, SamplingParams +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams def create_test_prompts() -> List[Tuple[str, SamplingParams]]: @@ -34,7 +34,7 @@ def process_requests(engine: LLMEngine, engine.add_request(str(request_id), prompt, sampling_params) request_id += 1 - request_outputs: List[CompletionRequestOutput] = engine.step() + request_outputs: List[RequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index 476466d3f033..6aa25b4689ec 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -9,7 +9,7 @@ from huggingface_hub import snapshot_download -from vllm import CompletionRequestOutput, EngineArgs, LLMEngine, SamplingParams +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest @@ -87,7 +87,7 @@ def process_requests(engine: LLMEngine, lora_request=lora_request) request_id += 1 - request_outputs: List[CompletionRequestOutput] = engine.step() + request_outputs: List[RequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: diff --git a/examples/offline_inference.py b/examples/offline_inference.py index bac7640174b1..6ac9446b4102 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -13,7 +13,7 @@ # Create an LLM. llm = LLM(model="facebook/opt-125m") # Generate texts from the prompts. The output is a list of -# CompletionRequestOutput objects that contain the prompt, generated text, and +# RequestOutput objects that contain the prompt, generated text, and # other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py index 26bc8d683c5e..aeca8544ccb9 100644 --- a/examples/offline_inference_distributed.py +++ b/examples/offline_inference_distributed.py @@ -25,7 +25,7 @@ def __init__(self): def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: # Generate texts from the prompts. 
- # The output is a list of CompletionRequestOutput objects that contain + # The output is a list of RequestOutput objects that contain # the prompt, generated text, and other information. outputs = self.llm.generate(batch["text"], sampling_params) prompt = [] diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py index 539634791b37..e4986ad8419c 100755 --- a/examples/offline_inference_neuron.py +++ b/examples/offline_inference_neuron.py @@ -27,7 +27,7 @@ device="neuron", tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of -# CompletionRequestOutput objects that contain the prompt, generated text, and +# RequestOutput objects that contain the prompt, generated text, and # other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index d8bc2a5ea6a5..20f4d3bc7f01 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -27,7 +27,7 @@ generating_prompts = [prefix + prompt for prompt in prompts] # Generate texts from the prompts. The output is a list of -# CompletionRequestOutput objects that contain the prompt, generated text, and +# RequestOutput objects that contain the prompt, generated text, and # other information. outputs = llm.generate(generating_prompts, sampling_params) # Print the outputs. diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index f9816540a0b4..7b1f4a9e1eb2 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -1,7 +1,7 @@ import pytest from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import CompletionRequestOutput +from vllm.outputs import RequestOutput @pytest.mark.asyncio @@ -55,7 +55,7 @@ async def test_request_tracker(): stream_5 = tracker.add_request("5") assert tracker.new_requests_event.is_set() tracker.process_request_output( - CompletionRequestOutput("2", "output", [], [], [], finished=True)) + RequestOutput("2", "output", [], [], [], finished=True)) await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() assert not tracker.new_requests_event.is_set() diff --git a/vllm/__init__.py b/vllm/__init__.py index f30fad73e03a..74674ca0d12a 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -6,7 +6,7 @@ from vllm.entrypoints.llm import LLM from vllm.executor.ray_utils import initialize_ray_cluster from vllm.model_executor.models import ModelRegistry -from vllm.outputs import (CompletionOutput, CompletionRequestOutput, +from vllm.outputs import (CompletionOutput, EmbeddingOutput, EmbeddingRequestOutput, RequestOutput) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -19,7 +19,7 @@ "SamplingParams", "RequestOutput", "CompletionOutput", - "CompletionRequestOutput", + "EmbeddingOutput", "EmbeddingRequestOutput", "LLMEngine", "EngineArgs", diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index d90311916756..a31f10b7748d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -14,7 +14,7 @@ from vllm.executor.ray_utils import initialize_ray_cluster, ray from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import CompletionRequestOutput, EmbeddingRequestOutput +from vllm.outputs import EmbeddingRequestOutput, 
RequestOutput from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, MultiModalData, SamplerOutput @@ -56,10 +56,8 @@ def __init__(self, request_id: str) -> None: self._queue: asyncio.Queue = asyncio.Queue() self._finished = False - def put( - self, item: Union[CompletionRequestOutput, EmbeddingRequestOutput, - Exception] - ) -> None: + def put(self, item: Union[RequestOutput, EmbeddingRequestOutput, + Exception]) -> None: if self._finished: return self._queue.put_nowait(item) @@ -75,8 +73,7 @@ def finished(self) -> bool: def __aiter__(self): return self - async def __anext__( - self) -> Union[CompletionRequestOutput, EmbeddingRequestOutput]: + async def __anext__(self) -> Union[RequestOutput, EmbeddingRequestOutput]: result = await self._queue.get() if isinstance(result, Exception): raise result @@ -113,7 +110,7 @@ def propagate_exception(self, self.abort_request(rid) def process_request_output(self, - request_output: Union[CompletionRequestOutput, + request_output: Union[RequestOutput, EmbeddingRequestOutput], *, verbose: bool = False) -> None: @@ -203,8 +200,7 @@ class _AsyncLLMEngine(LLMEngine): """Extension of LLMEngine to add async methods.""" async def step_async( - self - ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: + self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. The workers are ran asynchronously if possible. @@ -587,7 +583,7 @@ async def generate( prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None - ) -> AsyncIterator[CompletionRequestOutput]: + ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. Generate outputs for a request. This method is a coroutine. It adds the @@ -605,7 +601,7 @@ async def generate( multi_modal_data: Multi modal data per request. Yields: - The output `CompletionRequestOutput` objects from the LLMEngine + The output `RequestOutput` objects from the LLMEngine for the request. 
Details: @@ -749,7 +745,7 @@ async def process_request( prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, multi_modal_data: Optional[MultiModalData] = None, - ) -> AsyncIterator[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: + ) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]: """Common logic to process requests with SamplingParams or PoolingParams.""" arrival_time = time.time() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3101b81e19f4..46fa41030b4a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -20,7 +20,7 @@ from vllm.executor.ray_utils import initialize_ray_cluster from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import (CompletionRequestOutput, EmbeddingRequestOutput, +from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -568,7 +568,7 @@ def _process_model_outputs( scheduled_seq_groups: List[ScheduledSequenceGroup], ignored_seq_groups: List[SequenceGroup], seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: + ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: """Apply the model output to the sequences in the scheduled seq groups. Returns RequestOutputs that can be returned to the client. @@ -600,7 +600,7 @@ def _process_model_outputs( self.scheduler.free_finished_seq_groups() # Create the outputs. - request_outputs: List[Union[CompletionRequestOutput, + request_outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = [] for scheduled_seq_group in scheduled_seq_groups: seq_group = scheduled_seq_group.seq_group @@ -612,9 +612,7 @@ def _process_model_outputs( request_outputs.append(request_output) return request_outputs - def step( - self - ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: + def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. .. figure:: https://i.imgur.com/sv2HssD.png diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fbe099631045..50f6128581b1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -8,8 +8,7 @@ from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import (CompletionRequestOutput, EmbeddingRequestOutput, - RequestOutput) +from vllm.outputs import EmbeddingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.sequence import MultiModalData @@ -324,8 +323,8 @@ def _add_request( multi_modal_data=multi_modal_data) def _run_engine( - self, use_tqdm: bool - ) -> List[Union[CompletionRequestOutput, EmbeddingRequestOutput]]: + self, use_tqdm: bool + ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() @@ -336,8 +335,7 @@ def _run_engine( postfix=f"Generation Speed: {0:.2f} toks/s", ) # Run the engine. 
- outputs: List[Union[CompletionRequestOutput, - EmbeddingRequestOutput]] = [] + outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = [] total_toks = 0 while self.llm_engine.has_unfinished_requests(): step_outputs = self.llm_engine.step() diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 18bd4e2dfbc1..1b469fc59b07 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -19,7 +19,7 @@ from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) -from vllm.outputs import CompletionRequestOutput +from vllm.outputs import RequestOutput from vllm.utils import random_uuid logger = init_logger(__name__) @@ -180,8 +180,8 @@ def get_chat_request_role(self, request: ChatCompletionRequest) -> str: async def chat_completion_stream_generator( self, request: ChatCompletionRequest, - result_generator: AsyncIterator[CompletionRequestOutput], - request_id: str, conversation: List[ConversationMessage] + result_generator: AsyncIterator[RequestOutput], request_id: str, + conversation: List[ConversationMessage] ) -> AsyncGenerator[str, None]: model_name = self.served_model_names[0] created_time = int(time.time()) @@ -320,13 +320,13 @@ async def chat_completion_stream_generator( async def chat_completion_full_generator( self, request: ChatCompletionRequest, raw_request: Request, - result_generator: AsyncIterator[CompletionRequestOutput], - request_id: str, conversation: List[ConversationMessage] + result_generator: AsyncIterator[RequestOutput], request_id: str, + conversation: List[ConversationMessage] ) -> Union[ErrorResponse, ChatCompletionResponse]: model_name = self.served_model_names[0] created_time = int(time.time()) - final_res: Optional[CompletionRequestOutput] = None + final_res: Optional[RequestOutput] = None async for res in result_generator: if await raw_request.is_disconnected(): diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index e5b11201bc4c..158d8ed7fbbf 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -17,7 +17,7 @@ from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) -from vllm.outputs import CompletionRequestOutput +from vllm.outputs import RequestOutput from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -86,7 +86,7 @@ async def create_completion(self, request: CompletionRequest, created_time = int(time.time()) # Schedule the request and get the result generator. - generators: List[AsyncIterator[CompletionRequestOutput]] = [] + generators: List[AsyncIterator[RequestOutput]] = [] try: sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) @@ -130,7 +130,7 @@ async def create_completion(self, request: CompletionRequest, return self.create_error_response(str(e)) result_generator: AsyncIterator[Tuple[ - int, CompletionRequestOutput]] = merge_async_iterators(*generators) + int, RequestOutput]] = merge_async_iterators(*generators) # Similar to the OpenAI API, when n != best_of, we do not stream the # results. 
In addition, we do not stream the results when use @@ -150,8 +150,7 @@ async def create_completion(self, request: CompletionRequest, num_prompts=len(prompts)) # Non-streaming response - final_res_batch: List[ - Optional[CompletionRequestOutput]] = [None] * len(prompts) + final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts) try: async for i, res in result_generator: if await raw_request.is_disconnected(): @@ -182,7 +181,7 @@ async def completion_stream_generator( self, request: CompletionRequest, raw_request: Request, - result_generator: AsyncIterator[Tuple[int, CompletionRequestOutput]], + result_generator: AsyncIterator[Tuple[int, RequestOutput]], request_id: str, created_time: int, model_name: str, @@ -278,7 +277,7 @@ async def completion_stream_generator( def request_output_to_completion_response( self, - final_res_batch: List[CompletionRequestOutput], + final_res_batch: List[RequestOutput], request: CompletionRequest, request_id: str, created_time: int, diff --git a/vllm/outputs.py b/vllm/outputs.py index 54f11d791bd6..f5e5facf6844 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,5 +1,4 @@ import time -from abc import ABC, abstractmethod from typing import List, Optional, Union from vllm.lora.request import LoRARequest @@ -77,32 +76,7 @@ def __repr__(self) -> str: f"embedding={len(self.embedding)}") -class RequestOutput(ABC): - """ - An abstract base class representing the output of a request to the LLM. - The request could be a completion request or an embedding request. - """ - - def __init__(self, request_id: str, prompt_token_ids: List[int], - finished: bool): - self.request_id = request_id - self.prompt_token_ids = prompt_token_ids - self.finished = finished - - @abstractmethod - def from_seq_group(cls, seq_group: 'SequenceGroup') -> "RequestOutput": - """ - A class method to initialize a RequestOutput (or its subclasses) - instance from a SequenceGroup. - """ - pass - - @abstractmethod - def __repr__(self) -> str: - pass - - -class CompletionRequestOutput(RequestOutput): +class RequestOutput: """The output data of a completion request to the LLM. Args: @@ -127,7 +101,9 @@ def __init__( metrics: Optional[RequestMetrics] = None, lora_request: Optional[LoRARequest] = None, ) -> None: - super().__init__(request_id, prompt_token_ids, finished) + self.request_id = request_id + self.prompt_token_ids = prompt_token_ids + self.finished = finished self.prompt = prompt self.prompt_logprobs = prompt_logprobs self.outputs = outputs @@ -135,8 +111,7 @@ def __init__( self.lora_request = lora_request @classmethod - def from_seq_group(cls, - seq_group: SequenceGroup) -> "CompletionRequestOutput": + def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": if seq_group.sampling_params is None: raise ValueError("Sampling parameters are missing in seq_group.") seqs = seq_group.get_seqs() @@ -186,7 +161,7 @@ def from_seq_group(cls, lora_request=seq_group.lora_request) def __repr__(self) -> str: - return (f"CompletionRequestOutput(request_id={self.request_id}, " + return (f"RequestOutput(request_id={self.request_id}, " f"prompt={self.prompt!r}, " f"prompt_token_ids={self.prompt_token_ids}, " f"prompt_logprobs={self.prompt_logprobs}, " @@ -196,7 +171,7 @@ def __repr__(self) -> str: f"lora_request={self.lora_request})") -class EmbeddingRequestOutput(RequestOutput): +class EmbeddingRequestOutput: """ The output data of an embedding request to the LLM. 
@@ -209,7 +184,9 @@ class EmbeddingRequestOutput(RequestOutput): def __init__(self, request_id: str, outputs: 'EmbeddingOutput', prompt_token_ids: List[int], finished: bool): - super().__init__(request_id, prompt_token_ids, finished) + self.request_id = request_id + self.prompt_token_ids = prompt_token_ids + self.finished = finished self.outputs = outputs @classmethod @@ -248,4 +225,4 @@ def create(seq_group): 'embeddings') and seq_group.embeddings is not None: return EmbeddingRequestOutput.from_seq_group(seq_group) else: - return CompletionRequestOutput.from_seq_group(seq_group) + return RequestOutput.from_seq_group(seq_group) From 128dfdd855714076e1bd4d28433465628afdfbe6 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 7 May 2024 11:43:26 -0700 Subject: [PATCH 15/41] Update EmbeddingModelBlockSpaceManager interface --- vllm/core/embedding_model_block_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/embedding_model_block_manager.py index aed7bb9307ff..adbde06dc0c8 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/embedding_model_block_manager.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import Dict, List, Tuple from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup @@ -36,7 +36,7 @@ def append_slots( self, seq: Sequence, num_lookahead_slots: int, - ) -> Dict[int, List[int]]: + ) -> List[Tuple[int, int]]: return None # type: ignore def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: From 25337de4ed165d7d4a5a28bb8b1af33ac70bf790 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 7 May 2024 14:16:42 -0700 Subject: [PATCH 16/41] Move sentence-transformers to requirements-common.txt - For AMD Tests --- requirements-common.txt | 1 + requirements-dev.txt | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index bd779d5acb68..e064e5e69853 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -18,3 +18,4 @@ lm-format-enforcer == 0.10.1 outlines == 0.0.34 # Requires torch >= 2.1.0 typing_extensions filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +sentence-transformers # required for embedding test diff --git a/requirements-dev.txt b/requirements-dev.txt index f943ff9645e3..e6d375cbafa3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -25,7 +25,6 @@ requests ray peft awscli -sentence-transformers # required for embedding test # Benchmarking aiohttp From 80ed35825d19b6364c38fd29aea07c6000d59c3e Mon Sep 17 00:00:00 2001 From: Chang Su Date: Wed, 8 May 2024 18:28:03 -0700 Subject: [PATCH 17/41] Fix Models Test and update interface for embedding_block_manager --- tests/conftest.py | 3 ++- vllm/core/embedding_model_block_manager.py | 6 +++--- vllm/entrypoints/openai/api_server.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index d6af7378c8c0..901cb11fb9f7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -162,7 +162,8 @@ def __init__( ) elif model_name in _EMBEDDING_MODELS: print("using sentence transformer") - self.model = SentenceTransformer(model_name, ).cuda() + self.model = SentenceTransformer( + model_name, ).to(dtype=torch_dtype).cuda() else: self.model = AutoModelForCausalLM.from_pretrained( model_name, diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/embedding_model_block_manager.py 
index adbde06dc0c8..a09d79ec3c42 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/embedding_model_block_manager.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Tuple +from typing import List, Tuple from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup @@ -47,13 +47,13 @@ def can_swap_in(self, seq_group: SequenceGroup, return AllocStatus.OK def swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> Dict[int, int]: + num_lookahead_slots: int) -> List[Tuple[int, int]]: return None # type: ignore def can_swap_out(self, seq_group: SequenceGroup) -> bool: return True - def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: + def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: return None # type: ignore def free(self, seq: Sequence) -> None: diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b7711d9e783f..7cd51b959a0e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -205,8 +205,8 @@ async def authentication(request: Request, call_next): args.chat_template) openai_serving_completion = OpenAIServingCompletion( engine, model_config, served_model_names, args.lora_modules) - openai_serving_embedding = OpenAIServingEmbedding( - engine, model_config, served_model_names) + openai_serving_embedding = OpenAIServingEmbedding(engine, model_config, + served_model_names) app.root_path = args.root_path uvicorn.run(app, host=args.host, From 4936aa54ec9f0364e93b3ea30e0928e57e716231 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Wed, 8 May 2024 23:28:39 -0700 Subject: [PATCH 18/41] Rebase --- vllm/entrypoints/llm.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 50f6128581b1..25f4428100b2 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -343,10 +343,12 @@ def _run_engine( if output.finished: outputs.append(output) if use_tqdm: - total_toks += (sum( - len(stp.token_ids) for stp in output.outputs)) - spd = total_toks / pbar.format_dict["elapsed"] - pbar.postfix = f"Generation Speed: {spd:.2f} toks/s" + if isinstance(output, RequestOutput): + # Calculate tokens only for RequestOutput + total_toks += sum( + len(stp.token_ids) for stp in output.outputs) + spd = total_toks / pbar.format_dict["elapsed"] + pbar.postfix = f"Generation Speed: {spd:.2f} toks/s" pbar.update(1) if use_tqdm: pbar.close() From 30785e633c66273b203105aee133e63732115315 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Thu, 9 May 2024 00:59:33 -0700 Subject: [PATCH 19/41] Fix Models Test --- tests/conftest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 901cb11fb9f7..57495d272f96 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -163,7 +163,9 @@ def __init__( elif model_name in _EMBEDDING_MODELS: print("using sentence transformer") self.model = SentenceTransformer( - model_name, ).to(dtype=torch_dtype).cuda() + model_name, + device="cpu", + ).to(dtype=torch_dtype).cuda() else: self.model = AutoModelForCausalLM.from_pretrained( model_name, From 1bf8531e24b0bf609611179945440fef3d7aef7a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 16:55:08 +0000 Subject: [PATCH 20/41] format --- requirements-common.txt | 1 - requirements-dev.txt | 9 ++++--- tests/test_sequence.py | 35 +++------------------------ vllm/worker/embedding_model_runner.py | 
7 ++++-- 4 files changed, 14 insertions(+), 38 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index e064e5e69853..bd779d5acb68 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -18,4 +18,3 @@ lm-format-enforcer == 0.10.1 outlines == 0.0.34 # Requires torch >= 2.1.0 typing_extensions filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 -sentence-transformers # required for embedding test diff --git a/requirements-dev.txt b/requirements-dev.txt index e6d375cbafa3..796c9e37d023 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -19,12 +19,15 @@ pytest-forked pytest-asyncio pytest-rerunfailures pytest-shard -httpx + +# testing utils +awscli einops # required for MPT +httpx +peft requests ray -peft -awscli +sentence-transformers # required for embedding # Benchmarking aiohttp diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 0d6ef2587352..b8ea1f6b7720 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,39 +1,10 @@ import pytest from tests.core.utils import create_dummy_prompt -from vllm import SamplingParams -from vllm.lora.request import LoRARequest from vllm.sequence import (CompletionSequenceGroupOutput, SamplerOutput, - Sequence, SequenceData, SequenceGroup, - SequenceOutput) - - -def create_dummy_prompt( - request_id: str, - prompt_length: int, - block_size: Optional[int] = None, - lora_request: Optional[LoRARequest] = None, - use_beam_search: bool = False, - best_of: int = 1, -) -> SequenceGroup: - if not block_size: - block_size = prompt_length - - # Create dummy prompt sequence with tokens 0...block_size-1 - # and prompt "0 ... block_size". - prompt_tokens = list(range(prompt_length)) - prompt_str = " ".join([str(t) for t in prompt_tokens]) - prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) - seq_group = SequenceGroup(request_id=request_id, - seqs=[prompt], - arrival_time=time.time(), - sampling_params=SamplingParams( - use_beam_search=use_beam_search, - best_of=best_of), - lora_request=lora_request) - return seq_group - - + SequenceData, SequenceOutput) + + @pytest.fixture def sample_outputs(): return [ diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 70123b674a9f..2d3f160c60dc 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -3,8 +3,9 @@ import torch from vllm.attention import AttentionMetadata -from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -25,6 +26,7 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", @@ -35,6 +37,7 @@ def __init__( parallel_config, scheduler_config, device_config, + cache_config, load_config, lora_config=lora_config, kv_cache_dtype=kv_cache_dtype, From f4c17a4d07f53becd60abba75a38fec2e68bd181 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 16:57:05 +0000 Subject: [PATCH 21/41] added test_embedding --- tests/models/{test_llama_embedding.py => test_embedding.py} | 
0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/models/{test_llama_embedding.py => test_embedding.py} (100%) diff --git a/tests/models/test_llama_embedding.py b/tests/models/test_embedding.py similarity index 100% rename from tests/models/test_llama_embedding.py rename to tests/models/test_embedding.py From 39b2973f97433b269a1ed617b0358829de10fb14 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 19:26:11 +0000 Subject: [PATCH 22/41] added examples --- examples/offline_inference.py | 5 ++--- examples/offline_inference_distributed.py | 5 ++--- examples/offline_inference_embedding.py | 18 ++++++++++++++++++ examples/openai_embedding_client.py | 22 ++++++++++++++++++++++ 4 files changed, 44 insertions(+), 6 deletions(-) create mode 100644 examples/offline_inference_embedding.py create mode 100644 examples/openai_embedding_client.py diff --git a/examples/offline_inference.py b/examples/offline_inference.py index 6ac9446b4102..9b758fa2479f 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -12,9 +12,8 @@ # Create an LLM. llm = LLM(model="facebook/opt-125m") -# Generate texts from the prompts. The output is a list of -# RequestOutput objects that contain the prompt, generated text, and -# other information. +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. for output in outputs: diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py index aeca8544ccb9..0026b73b92b9 100644 --- a/examples/offline_inference_distributed.py +++ b/examples/offline_inference_distributed.py @@ -24,9 +24,8 @@ def __init__(self): self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf") def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: - # Generate texts from the prompts. - # The output is a list of RequestOutput objects that contain - # the prompt, generated text, and other information. + # The output is a list of RequestOutput objects that contain the prompt, + # generated text, and other information. outputs = self.llm.generate(batch["text"], sampling_params) prompt = [] generated_text = [] diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference_embedding.py new file mode 100644 index 000000000000..b378db5b10e1 --- /dev/null +++ b/examples/offline_inference_embedding.py @@ -0,0 +1,18 @@ +from vllm import LLM + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create an LLM. +model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = model.encode(prompts) +# Print the outputs. +for output in outputs: + print(output.outputs.embedding) # list of 4096 floats diff --git a/examples/openai_embedding_client.py b/examples/openai_embedding_client.py new file mode 100644 index 000000000000..7568789f5234 --- /dev/null +++ b/examples/openai_embedding_client.py @@ -0,0 +1,22 @@ +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. 
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +responses = client.embeddings.create(input=[ + "Hello my name is", + "The best thing about vLLM is that it supports many different models" +], model=model) + +for data in responses.data: + print(data.embedding) # list of float of len 4096 \ No newline at end of file From 6bdb32e1dfa5dbc8a657c3c3c57597a6f7a86a77 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 19:27:37 +0000 Subject: [PATCH 23/41] cleanup --- examples/offline_inference_distributed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py index 0026b73b92b9..e4f085fa6665 100644 --- a/examples/offline_inference_distributed.py +++ b/examples/offline_inference_distributed.py @@ -24,6 +24,7 @@ def __init__(self): self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf") def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: + # Generate texts from the prompts. # The output is a list of RequestOutput objects that contain the prompt, # generated text, and other information. outputs = self.llm.generate(batch["text"], sampling_params) From 9b7eccc50da1a49e8f4291316de1f5516bf17264 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 19:28:52 +0000 Subject: [PATCH 24/41] cleanup --- examples/offline_inference_embedding.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference_embedding.py index b378db5b10e1..aceac44933ff 100644 --- a/examples/offline_inference_embedding.py +++ b/examples/offline_inference_embedding.py @@ -10,8 +10,7 @@ # Create an LLM. model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. +# Generate texts from the prompts. The output is a list of EmbeddingRequestOutputs. outputs = model.encode(prompts) # Print the outputs. for output in outputs: From 55a280e1e30d85779b4ff8f4f9e642fcf46b50df Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 19:29:43 +0000 Subject: [PATCH 25/41] cleanup --- examples/offline_inference_neuron.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py index e4986ad8419c..5ecbbf020ab8 100755 --- a/examples/offline_inference_neuron.py +++ b/examples/offline_inference_neuron.py @@ -26,9 +26,8 @@ # or explicitly assigned. device="neuron", tensor_parallel_size=2) -# Generate texts from the prompts. The output is a list of -# RequestOutput objects that contain the prompt, generated text, and -# other information. +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. 
for output in outputs: From aa5c82a4fded391dd4e52288cdeb6240044b4db0 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 19:29:59 +0000 Subject: [PATCH 26/41] cleanup --- examples/offline_inference_with_prefix.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 20f4d3bc7f01..7ed0563f14e0 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -26,9 +26,8 @@ generating_prompts = [prefix + prompt for prompt in prompts] -# Generate texts from the prompts. The output is a list of -# RequestOutput objects that contain the prompt, generated text, and -# other information. +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. outputs = llm.generate(generating_prompts, sampling_params) # Print the outputs. for output in outputs: From 0e9d79c85f93a388779c97bc3038f2cde33b6bcd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 19:30:17 +0000 Subject: [PATCH 27/41] new line --- examples/openai_embedding_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/openai_embedding_client.py b/examples/openai_embedding_client.py index 7568789f5234..20ad103a577c 100644 --- a/examples/openai_embedding_client.py +++ b/examples/openai_embedding_client.py @@ -19,4 +19,4 @@ ], model=model) for data in responses.data: - print(data.embedding) # list of float of len 4096 \ No newline at end of file + print(data.embedding) # list of float of len 4096 From cc3224f6806ceadfe29819e825dd65a867dea920 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 19:40:55 +0000 Subject: [PATCH 28/41] reducing changes --- tests/core/test_block_manager.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index a4996b5ab097..e40d86029feb 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -306,11 +306,8 @@ def test_sliding_window_multi_seq(): assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks parent = Sequence(1, "one two three", [0, 1, 2], block_size) - seq_group = SequenceGroup(request_id="1", - seqs=[parent], - arrival_time=time.time(), - sampling_params=SamplingParams(), - lora_request=None) + seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(), + None) block_manager.allocate(seq_group) # assert the number of blocks allocated is correct From af3ef42ff3cd75d239d90ce93b4e494bc9c238cd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 19:59:19 +0000 Subject: [PATCH 29/41] simplify test changes --- tests/core/utils.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index 8fb13177a2d6..22c1d3826dff 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -22,13 +22,10 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) - seq_group = SequenceGroup(request_id=request_id, - seqs=[prompt], - arrival_time=time.time(), - sampling_params=SamplingParams( - use_beam_search=use_beam_search, - best_of=best_of), - lora_request=lora_request) + seq_group = SequenceGroup( + request_id, [prompt], + 
SamplingParams(use_beam_search=use_beam_search, best_of=best_of), + time.time(), lora_request) return prompt, seq_group From 45732b73f3f70c3fcfb5a950a1e9f1da7d2390bf Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:01:17 +0000 Subject: [PATCH 30/41] simplify test changes --- tests/core/test_block_manager.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index e40d86029feb..9db58e075196 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -142,11 +142,8 @@ def test_append_slot_cow(): child = prompt.fork(new_seq_id=2) # Allocate space for the sequence group. - seq_group = SequenceGroup(request_id="1", - seqs=[prompt, child], - arrival_time=time.time(), - sampling_params=SamplingParams(), - lora_request=time.perf_counter) + seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), + time.time(), time.perf_counter) block_manager.allocate(seq_group) # Fork and append a new token id. We expect a COW to be scheduled. From 6e8243f6a1dd8a99261888261781dec3ca47ea4f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:03:21 +0000 Subject: [PATCH 31/41] simplify test changes --- tests/test_sequence.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_sequence.py b/tests/test_sequence.py index b8ea1f6b7720..154136a45503 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -10,8 +10,7 @@ def sample_outputs(): return [ CompletionSequenceGroupOutput(samples=[ SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) - ], - prompt_logprobs=None) for i in range(5) + ], prompt_logprobs=None) for i in range(5) ] From acf210bbb9eefca36955fdd2b425745a7b1f2ef5 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:03:39 +0000 Subject: [PATCH 32/41] simplify test changes --- tests/test_sequence.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 154136a45503..aad0a6dd7e89 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -33,8 +33,7 @@ def test_sampler_output_getitem(sampler_output, sample_outputs): def test_sampler_output_setitem(sampler_output): new_output = CompletionSequenceGroupOutput(samples=[ SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) - ], - prompt_logprobs=None) + ], prompt_logprobs=None) sampler_output[2] = new_output assert sampler_output[2] == new_output From 1801636d54790a5dfb27a2e361839fb71460b746 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:06:03 +0000 Subject: [PATCH 33/41] style for setting up embedding mode in model_config --- vllm/config.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 03a28accee8c..88b4b157419e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -128,7 +128,7 @@ def __init__( served_model_name) if not self.skip_tokenizer_init: self._verify_tokenizer_mode() - self.embedding_mode = self._check_embedding_mode() + self._verify_embedding_mode() self._verify_quantization() self._verify_cuda_graph() @@ -139,6 +139,11 @@ def _verify_tokenizer_mode(self) -> None: f"Unknown tokenizer mode: {self.tokenizer_mode}. 
Must be " "either 'auto' or 'slow'.") self.tokenizer_mode = tokenizer_mode + + def _verify_embedding_mode(self) -> None: + architectures = getattr(self.hf_config, "architectures", []) + self.embedding_mode = any( + ModelRegistry.is_embedding_model(arch) for arch in architectures) def _verify_quantization(self) -> None: supported_quantization = [*QUANTIZATION_METHODS] @@ -214,11 +219,6 @@ def _verify_cuda_graph(self) -> None: self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) - def _check_embedding_mode(self) -> bool: - architectures = getattr(self.hf_config, "architectures", []) - return any( - ModelRegistry.is_embedding_model(arch) for arch in architectures) - def verify_with_parallel_config( self, parallel_config: "ParallelConfig", From d97b64d06b6b92cfd3db42774b308da2210908cd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:22:27 +0000 Subject: [PATCH 34/41] nit on engineargs --- vllm/engine/arg_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 14a751ad2811..163723b4be36 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -445,12 +445,14 @@ def add_cli_args( action='store_true', help='If set, the prefill requests can be chunked based on the ' 'max_num_batched_tokens.') + parser.add_argument( '--speculative-model', type=nullable_str, default=EngineArgs.speculative_model, help= 'The name of the draft model to be used in speculative decoding.') + parser.add_argument( '--num-speculative-tokens', type=int, @@ -495,6 +497,7 @@ def add_cli_args( 'corresponding to the chosen load_format. ' 'This should be a JSON string that will be ' 'parsed into a dictionary.') + parser.add_argument( "--served-model-name", nargs="+", @@ -509,6 +512,7 @@ def add_cli_args( "will also be used in `model_name` tag content of " "prometheus metrics, if multiple names provided, metrics" "tag will take the first one.") + return parser @classmethod From 3655086974a0227b8ad4796c1b089d0c8e77beed Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:33:01 +0000 Subject: [PATCH 35/41] updated comment --- vllm/outputs.py | 10 ++++++---- vllm/sequence.py | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index f5e5facf6844..f9bce9e683f2 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -102,18 +102,19 @@ def __init__( lora_request: Optional[LoRARequest] = None, ) -> None: self.request_id = request_id - self.prompt_token_ids = prompt_token_ids - self.finished = finished self.prompt = prompt + self.prompt_token_ids = prompt_token_ids self.prompt_logprobs = prompt_logprobs self.outputs = outputs + self.finished = finished self.metrics = metrics self.lora_request = lora_request @classmethod def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": if seq_group.sampling_params is None: - raise ValueError("Sampling parameters are missing in seq_group.") + raise ValueError( + "Sampling parameters are missing for a CompletionRequest.") seqs = seq_group.get_seqs() if len(seqs) == 1: top_n_seqs = seqs @@ -193,7 +194,8 @@ def __init__(self, request_id: str, outputs: 'EmbeddingOutput', def from_seq_group(cls, seq_group: 'SequenceGroup') -> "EmbeddingRequestOutput": if seq_group.embeddings is None: - raise ValueError("Embeddings are missing in seq_group.") + raise ValueError( + "Embeddings are missing in seq_group for EmbeddingRequest.") output = EmbeddingOutput(seq_group.embeddings) prompt_token_ids = 
seq_group.prompt_token_ids finished = seq_group.is_finished() diff --git a/vllm/sequence.py b/vllm/sequence.py index 66a6963f4054..5c38454be402 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -404,8 +404,10 @@ class SequenceGroup: arrival_time: The arrival time of the request. lora_request: LoRA request. multi_modal_data: Multi modal data associated with the request. - embeddings: The embeddings vectors of the prompt of the sequence group. - pooling_params: The pooling parameters used to generate the pooling. + embeddings: The embeddings vectors of the prompt of the sequence group + for an embedding model. + pooling_params: The pooling parameters used to generate the pooling + for an embedding model. """ def __init__( From 2c6ae8038dbd59d1928b93e97067316393b5e3c7 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:35:51 +0000 Subject: [PATCH 36/41] cleanup --- vllm/sequence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index 5c38454be402..ecb1c5ab047d 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -818,7 +818,7 @@ def clone( @dataclass class PoolerOutput: - """The output from a pooling operation in the Llama model.""" + """The output from a pooling operation in the embedding model.""" outputs: List[EmbeddingSequenceGroupOutput] spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None From 9303a602a5376ee7c59ac39b990e731b650c9a88 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:41:51 +0000 Subject: [PATCH 37/41] removed change from llama.py --- tests/conftest.py | 1 - vllm/model_executor/models/llama.py | 38 ------------------ vllm/model_executor/models/llama_embedding.py | 40 ++++++++++++++++++- 3 files changed, 39 insertions(+), 40 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 57495d272f96..c6eef1f5b13d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -161,7 +161,6 @@ def __init__( torch_dtype=torch_dtype, ) elif model_name in _EMBEDDING_MODELS: - print("using sentence transformer") self.model = SentenceTransformer( model_name, device="cpu", diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index a48b86c53588..f6d7fc8733fc 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -298,44 +298,6 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - class LlamaForCausalLM(nn.Module): packed_modules_mapping = { diff --git a/vllm/model_executor/models/llama_embedding.py b/vllm/model_executor/models/llama_embedding.py index e8b61636b5dd..54484a3f551f 100644 --- a/vllm/model_executor/models/llama_embedding.py +++ b/vllm/model_executor/models/llama_embedding.py @@ -5,6 +5,8 @@ from vllm.attention import AttentionMetadata from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader) from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import PoolerOutput @@ -47,5 +49,41 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - self.model.load_weights(weights) + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.model.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From 5adda0a4fa071c77cfc862371f2372bd6cb82ed5 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:51:24 +0000 Subject: [PATCH 38/41] final review --- vllm/model_executor/pooling_metadata.py | 2 +- vllm/sequence.py | 42 ++++++++++++------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index 28eeec0f618f..b86cafce85d1 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -33,7 +33,7 @@ def __repr__(self) -> str: return ("PoolingMetadata(" f"seq_groups={self.seq_groups}, " f"seq_data={self.seq_data}, " - f"prompt_lens={self.prompt_lens}, ") + f"prompt_lens={self.prompt_lens})") @dataclass diff --git a/vllm/sequence.py b/vllm/sequence.py index ecb1c5ab047d..46ac33b7ecab 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -786,6 +786,27 @@ def __repr__(self) -> str: f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") +@dataclass +class PoolerOutput: + """The output from a pooling operation in the embedding model.""" + outputs: List[EmbeddingSequenceGroupOutput] + + spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + + def __getitem__(self, idx: int): + return self.outputs[idx] + + def __setitem__(self, idx: int, value): + self.outputs[idx] = value + + def __len__(self): + return len(self.outputs) + + def __eq__(self, other: object): + return isinstance(other, + self.__class__) and self.outputs == other.outputs + + @dataclass class ExecuteModelRequest: """The model execution request.""" @@ -814,24 +835,3 @@ def clone( num_lookahead_slots=self.num_lookahead_slots, running_queue_size=self.running_queue_size, ) - - -@dataclass -class PoolerOutput: - """The output from a pooling operation in the embedding model.""" - outputs: List[EmbeddingSequenceGroupOutput] - - spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None - - def __getitem__(self, idx: int): - return self.outputs[idx] - - def __setitem__(self, idx: int, value): - self.outputs[idx] = value - - def __len__(self): - return len(self.outputs) - - def __eq__(self, other: object): - return isinstance(other, - self.__class__) and self.outputs == other.outputs From 8747bf62f052dcf4717af9f4e05d8c5ee64fd62c Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:52:49 +0000 Subject: [PATCH 39/41] final review --- examples/offline_inference_embedding.py | 4 ++-- examples/openai_embedding_client.py | 5 +++-- tests/test_sequence.py | 6 ++++-- vllm/config.py | 2 +- vllm/model_executor/models/llama_embedding.py | 1 - 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference_embedding.py index aceac44933ff..7d5ef128bc8e 100644 --- a/examples/offline_inference_embedding.py +++ b/examples/offline_inference_embedding.py @@ -10,8 +10,8 @@ # Create an LLM. model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) -# Generate texts from the prompts. The output is a list of EmbeddingRequestOutputs. +# Generate embedding. The output is a list of EmbeddingRequestOutputs. outputs = model.encode(prompts) # Print the outputs. 
for output in outputs: - print(output.outputs.embedding) # list of 4096 floats + print(output.outputs.embedding) # list of 4096 floats diff --git a/examples/openai_embedding_client.py b/examples/openai_embedding_client.py index 20ad103a577c..b73360fe15a2 100644 --- a/examples/openai_embedding_client.py +++ b/examples/openai_embedding_client.py @@ -16,7 +16,8 @@ responses = client.embeddings.create(input=[ "Hello my name is", "The best thing about vLLM is that it supports many different models" -], model=model) +], + model=model) for data in responses.data: - print(data.embedding) # list of float of len 4096 + print(data.embedding) # list of float of len 4096 diff --git a/tests/test_sequence.py b/tests/test_sequence.py index aad0a6dd7e89..b8ea1f6b7720 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -10,7 +10,8 @@ def sample_outputs(): return [ CompletionSequenceGroupOutput(samples=[ SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) - ], prompt_logprobs=None) for i in range(5) + ], + prompt_logprobs=None) for i in range(5) ] @@ -33,7 +34,8 @@ def test_sampler_output_getitem(sampler_output, sample_outputs): def test_sampler_output_setitem(sampler_output): new_output = CompletionSequenceGroupOutput(samples=[ SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) - ], prompt_logprobs=None) + ], + prompt_logprobs=None) sampler_output[2] = new_output assert sampler_output[2] == new_output diff --git a/vllm/config.py b/vllm/config.py index 88b4b157419e..fab9cfbf41a2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -139,7 +139,7 @@ def _verify_tokenizer_mode(self) -> None: f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " "either 'auto' or 'slow'.") self.tokenizer_mode = tokenizer_mode - + def _verify_embedding_mode(self) -> None: architectures = getattr(self.hf_config, "architectures", []) self.embedding_mode = any( diff --git a/vllm/model_executor/models/llama_embedding.py b/vllm/model_executor/models/llama_embedding.py index 54484a3f551f..471978c44423 100644 --- a/vllm/model_executor/models/llama_embedding.py +++ b/vllm/model_executor/models/llama_embedding.py @@ -49,7 +49,6 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) From 8475e5f97115bcdf4b102f9e0d1212f22c6b92ae Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 20:54:37 +0000 Subject: [PATCH 40/41] format --- vllm/model_executor/models/llama_embedding.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llama_embedding.py b/vllm/model_executor/models/llama_embedding.py index 471978c44423..8f1c77da50d9 100644 --- a/vllm/model_executor/models/llama_embedding.py +++ b/vllm/model_executor/models/llama_embedding.py @@ -5,8 +5,7 @@ from vllm.attention import AttentionMetadata from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import PoolerOutput From 570b04a8dd224fb8ff1ea56563fb42a7204e82be Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 11 May 2024 11:52:35 -0400 Subject: 
[PATCH 41/41] Update conftest.py --- tests/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index c6eef1f5b13d..b8117a19c75d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,6 @@ import pytest import torch from PIL import Image -from sentence_transformers import SentenceTransformer from transformers import (AutoModelForCausalLM, AutoProcessor, LlavaForConditionalGeneration) @@ -161,6 +160,8 @@ def __init__( torch_dtype=torch_dtype, ) elif model_name in _EMBEDDING_MODELS: + # Lazy init required for AMD CI + from sentence_transformers import SentenceTransformer self.model = SentenceTransformer( model_name, device="cpu",
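
The conftest.py change in this last patch defers the sentence_transformers import so that importing the test module (for example during pytest collection on runners where the package is not installed, as noted for the AMD CI) does not fail. A minimal sketch of that lazy-import pattern follows; load_embedding_runner is a hypothetical helper name used only for illustration, and only the structure mirrors the patch:

    def load_embedding_runner(model_name: str):
        # Import inside the function so the optional dependency is required
        # only when an embedding model is actually requested, not at module
        # import time (e.g. during pytest collection).
        from sentence_transformers import SentenceTransformer
        return SentenceTransformer(model_name, device="cpu")

    # Hypothetical usage: only this call needs sentence-transformers installed.
    # runner = load_embedding_runner("intfloat/e5-mistral-7b-instruct")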