diff --git a/docs/source/usage/faq.rst b/docs/source/usage/faq.rst
index ce327abd5fa20..d88da32092924 100644
--- a/docs/source/usage/faq.rst
+++ b/docs/source/usage/faq.rst
@@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
 
 Q: Which model to use for offline inference embedding?
 
-A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
+A: You can try `e5-mistral-7b-instruct <https://huggingface.co/intfloat/e5-mistral-7b-instruct>`__ and `BAAI/bge-base-en-v1.5 <https://huggingface.co/BAAI/bge-base-en-v1.5>`__;
+more are listed :ref:`here <supported_models>`.
+
+By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__ and
+`Mistral-7B-Instruct-v0.3 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`__ into embedding models,
+but they are expected to be inferior to models that are specifically trained on embedding tasks.
 
 ----------------------------------------
 
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index 658039bfc3365..534f79b3a60bf 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -14,7 +14,7 @@
 from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                       ModelInputForGPUWithSamplingMetadata)
 
-# Placeholder attention backend for models like Mamba and embedding models that
+# Placeholder attention backend for models like Mamba and pooling models that
 # lack attention.
 
 
diff --git a/vllm/config.py b/vllm/config.py
index 2d9a76fe7ddb1..322c8f8990a40 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -152,7 +152,7 @@ class ModelConfig:
             this argument will be used to configure the neuron config that
             can not be gathered from the vllm arguments.
         override_pooler_config: Initialize non default pooling config or
-            override default pooling config for the embedding model.
+            override default pooling config for the pooling model.
     """
 
     def __init__(
@@ -576,7 +576,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        # Async postprocessor is not necessary with embedding mode
+        # Async postprocessor is not necessary for pooling models
         # since there is no token generation
         if self.runner_type == "pooling":
             self.use_async_output_proc = False
@@ -1825,11 +1825,11 @@ class MultiModalConfig:
 
 @dataclass
 class PoolerConfig:
-    """Controls the behavior of output pooling in embedding models."""
+    """Controls the behavior of output pooling in pooling models."""
 
     pooling_type: Optional[str] = None
     """
-    The pooling method of the embedding model. This should be a key in
+    The pooling method of the pooling model. This should be a key in
     :class:`vllm.model_executor.layers.pooler.PoolingType`.
     """
 
diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py
index 26d42b7f1790e..a47e594518534 100644
--- a/vllm/core/placeholder_block_space_manager.py
+++ b/vllm/core/placeholder_block_space_manager.py
@@ -8,7 +8,7 @@
 class PlaceholderBlockSpaceManager(BlockSpaceManager):
     """A version of BlockSpaceManager for use in environments where
     block management is not required.
-    For example: embedding models or attention-free models like Mamba.
+    For example: pooling models or attention-free models like Mamba.
 
     This class provides the same interface as BlockSpaceManager, but its
     methods perform no actions or return simple values like True in specific
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d485c2a9e7208..7337522bc9952 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -893,7 +893,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             '--override-pooler-config',
             type=PoolerConfig.from_json,
             default=None,
-            help="Override or set the pooling method in the embedding model. "
+            help="Override or set the pooling method for pooling models. "
             "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")
 
         parser.add_argument('--compilation-config',
@@ -1085,7 +1085,7 @@ def create_engine_config(self,
                 "setting --max-model-len to a smaller value.", max_model_len)
         elif (self.enable_chunked_prefill
               and model_config.runner_type == "pooling"):
-            msg = "Chunked prefill is not supported for embedding models"
+            msg = "Chunked prefill is not supported for pooling models"
             raise ValueError(msg)
 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 60dccd7a0812c..32396fd10188d 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1085,7 +1085,7 @@ async def encode(
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from an embedding model.
+        """Generate outputs for a request from a pooling model.
 
         Generate outputs for a request. This method is a coroutine. It adds the
         request into the waiting queue of the LLMEngine and streams the outputs
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index a729023bc00bb..0a046c71e86e8 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -527,7 +527,7 @@ def encode(
         *,
         inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from an embedding model.
+        """Generate outputs for a request from a pooling model.
 
         Generate outputs for a request. This method is a coroutine. It adds the
         request into the waiting queue of the LLMEngine and streams the outputs
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 4079de7d36793..a066836b92708 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -209,7 +209,7 @@ def encode(
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from an embedding model."""
+        """Generate outputs for a request from a pooling model."""
         ...
 
     @abstractmethod
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index fed06fa452955..4929e720c00e4 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -119,7 +119,7 @@ async def create_score(
 
         if prompt_adapter_request is not None:
             raise NotImplementedError("Prompt adapter is not supported "
-                                      "for embedding models")
+                                      "for scoring models")
 
         if isinstance(tokenizer, MistralTokenizer):
             raise ValueError(
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 669124319c4f4..b0f3c1cc3609f 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -618,9 +618,9 @@ class SequenceGroup:
         arrival_time: The arrival time of the request.
         lora_request: LoRA request.
         embeddings: The embeddings vectors of the prompt of the sequence group
-            for an embedding model.
+            for a pooling model.
         pooling_params: The pooling parameters used to generate the pooling
-            for an embedding model.
+            for a pooling model.
         encoder_seq: Optional, the single encoder sequence. Should be None
             unless you are working with an encoder/decoder model.
         trace_headers: OpenTelemetry trace headers.
@@ -1102,7 +1102,7 @@ class PoolerOutput(
         msgspec.Struct,
         omit_defaults=True,  # type: ignore[call-arg]
         array_like=True):  # type: ignore[call-arg]
-    """The output from a pooling operation in the embedding model."""
+    """The output from a pooling operation in the pooling model."""
     outputs: List[EmbeddingSequenceGroupOutput]
 
     # lazy import to avoid circular import
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 120fc64969552..e0e525b30a767 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -59,7 +59,7 @@ def process_inputs(
         priority: int = 0,
     ) -> Tuple[DetokenizerRequest, EngineCoreRequest]:
 
-        # TODO(woosuk): Support embedding mode.
+        # TODO(woosuk): Support pooling models.
         # TODO(woosuk): Check max_logprobs
         # TODO(woosuk): Support encoder-decoder models.
 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index ba3d4a130a80b..09758a5d9accf 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -178,7 +178,7 @@ def __init__(
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
         self.cache_engine: List[CPUCacheEngine]
-        # Initialize cpu_cache as embedding models don't initialize kv_caches
+        # Initialize cpu_cache as pooling models don't initialize kv_caches
         self.cpu_cache: Optional[List[List[torch.Tensor]]] = None
 
         # Torch profiler. Enabled and configured through env vars:
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 493f7a9fad098..cca7cd50bfc7b 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -65,8 +65,8 @@ def __init__(
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
         self.cache_engine: List[HPUCacheEngine]
-        # Initialize gpu_cache as embedding models don't initialize kv_caches
-        self.hpu_cache: Optional[List[List[torch.tensor]]] = None
+        # Initialize gpu_cache as pooling models don't initialize kv_caches
+        self.hpu_cache: Optional[List[List[torch.Tensor]]] = None
         # Torch profiler. Enabled and configured through env vars:
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
         if envs.VLLM_TORCH_PROFILER_DIR:
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 832b9903b7abc..a368bb9ee9a5b 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -91,7 +91,7 @@ def __init__(
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
         self.cache_engine: List[CacheEngine]
-        # Initialize gpu_cache as embedding models don't initialize kv_caches
+        # Initialize gpu_cache as pooling models don't initialize kv_caches
        self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
 
         self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}
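
Usage note (reviewer sketch, not part of the patch): the FAQ hunk above recommends dedicated embedding models such as `intfloat/e5-mistral-7b-instruct` for offline inference, and the rest of the diff renames the machinery that serves them to "pooling". A minimal offline sketch is below, using the `LLM` entrypoint and its `encode()` API, which returns the pooling-request outputs referenced throughout this diff; the `task="embed"` value is an assumption (some releases spell it `task="embedding"`).

```python
# Sketch only: offline embedding with a pooling model, per the FAQ hunk above.
# Assumption: task="embed" selects the pooling runner (older vLLM releases
# accept task="embedding" instead).
from vllm import LLM

llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")

# encode() runs the pooler instead of the sampler, so no sampling params are
# needed; each output carries the pooled hidden state for one prompt.
outputs = llm.encode(["Hello, my name is", "The capital of France is"])
for output in outputs:
    embedding = output.outputs.embedding  # list of floats, one per hidden dim
    print(f"Embedding dimension: {len(embedding)}")
```

On the server side, the same pooler can be tuned with the renamed flag shown in the `arg_utils.py` hunk, e.g. `--override-pooler-config '{"pooling_type": "mean", "normalize": false}'`.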