Roberta embedding #7969

Closed
wants to merge 639 commits
639 commits
919bf88
BART e2e test runs but does not pass
afeldman-nm Jun 25, 2024
753bab0
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jun 25, 2024
125e5dc
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 25, 2024
597526a
removed extra line
afeldman-nm Jun 25, 2024
a178b7a
changed nested if/else to elif/else in xformers mask computation code
afeldman-nm Jun 25, 2024
06c7f75
reorganized helper functions that were only being used for testing in…
afeldman-nm Jun 25, 2024
47c9f39
removed attention_type
afeldman-nm Jun 25, 2024
2f0b05b
typing and formatting
afeldman-nm Jun 25, 2024
d23c284
typing and formatting; fixed escape sequences in comments
afeldman-nm Jun 25, 2024
1a6e5a3
moved make_tensor_with_pad() helper function back to vllm.utils
afeldman-nm Jun 25, 2024
e2a46e3
formatting
afeldman-nm Jun 25, 2024
d43141f
merge; a lot of formatting fixes to bart code but not fully passing
afeldman-nm Jun 25, 2024
5169a2a
removed unnecessary positions arguments from BART encoder, decoder fo…
afeldman-nm Jun 25, 2024
4400d77
some reformatting
afeldman-nm Jun 25, 2024
e61385d
fixed bug caused by overzealous refactoring
afeldman-nm Jun 25, 2024
41e31e8
BART with new explanatory comments & passing formatting tests
afeldman-nm Jun 25, 2024
ba4e2c1
Removed unnecessary position arguments from BART routine; formatting
afeldman-nm Jun 25, 2024
4dabe19
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 25, 2024
a5c28fc
Merge branch 'infra_enc_dec_cross_attn' into infra_enc_dec_model_runn…
afeldman-nm Jun 25, 2024
7ca0d7a
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 26, 2024
c24697f
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 27, 2024
75756b9
removed redundant elif
afeldman-nm Jun 27, 2024
bcccc34
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 27, 2024
c8f8d59
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 27, 2024
a501849
reverted unnecessarily vllm/utils.py changes
afeldman-nm Jun 27, 2024
83d474e
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 28, 2024
64981b5
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 28, 2024
8d36458
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 29, 2024
5ff9c76
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jun 30, 2024
2828aa7
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jul 1, 2024
65e47db
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jul 3, 2024
44c6270
manually merged BART code in from previous modelrunner attempt, it wo…
afeldman-nm Jul 3, 2024
b085795
Merge branch 'infra_enc_dec_cross_attn' into infra_enc_dec_model_runner2
afeldman-nm Jul 3, 2024
ba09fbc
refactored where a number of constants are stored, primarily constant…
afeldman-nm Jul 3, 2024
2f0eb9b
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jul 3, 2024
d81662c
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jul 4, 2024
22d013c
Merge branch 'infra_enc_dec_cross_attn' into infra_enc_dec_model_runner2
afeldman-nm Jul 4, 2024
13f5b50
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jul 5, 2024
5dbebbc
Update vllm/attention/backends/torch_sdpa.py
afeldman-nm Jul 8, 2024
07df0e1
Update vllm/attention/layer.py
afeldman-nm Jul 8, 2024
7e0bc57
Merge branch 'main' into infra_enc_dec_cross_attn_reviews
afeldman-nm Jul 8, 2024
e837a73
Merge branch 'infra_enc_dec_cross_attn_reviews' into infra_enc_dec_cr…
afeldman-nm Jul 8, 2024
7ce9a51
merged in first pieces of woosuk feedback & latest main; formatting
afeldman-nm Jul 8, 2024
9ae6728
fixed specific point-changes requested by woosuk
afeldman-nm Jul 8, 2024
a1bf652
test_encoder_decoder_attn.py cleanup
afeldman-nm Jul 8, 2024
4f27946
tests/kernels/utils.py cleanup
afeldman-nm Jul 8, 2024
5ee30fe
vllm/attention/backends/abstract.py cleanup
afeldman-nm Jul 8, 2024
45fc9f7
vllm/attention/backends/blocksparse_attn.py cleanup
afeldman-nm Jul 8, 2024
097aff2
vllm/attention/backends/flash_attn.py cleanup
afeldman-nm Jul 8, 2024
d8a692b
cleaning up a number of backends & backends utils.py
afeldman-nm Jul 8, 2024
5df73fc
xformers backend cleanup
afeldman-nm Jul 8, 2024
6cd595c
formatting
afeldman-nm Jul 8, 2024
db49d48
Merge branch 'infra_enc_dec_cross_attn' into infra_enc_dec_model_runner2
afeldman-nm Jul 8, 2024
88e284a
merge from main
afeldman-nm Jul 8, 2024
c90140f
Merge branch 'main' into infra_enc_dec_model_runner2
afeldman-nm Jul 8, 2024
bd14d29
wip scheduler
afeldman-nm Jul 9, 2024
2c80185
formatting
afeldman-nm Jul 9, 2024
4c01f13
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 9, 2024
c95adf5
scheduler supports encoder-/cross-attention & passes existing schedul…
afeldman-nm Jul 9, 2024
d1343aa
scheduler test passes
afeldman-nm Jul 9, 2024
b4a461d
formatting
afeldman-nm Jul 9, 2024
6a71f8f
formatting
afeldman-nm Jul 9, 2024
fe7786c
Merge remote-tracking branch 'bert_deps/afeldman-nm/infra_enc_dec_mod…
laishzh Jul 10, 2024
9a63f51
wip model runner
afeldman-nm Jul 10, 2024
f649944
Merge branch 'main' into infra_enc_dec_model_runner
afeldman-nm Jul 10, 2024
685604c
wip modelrunner
afeldman-nm Jul 12, 2024
9c898f5
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 12, 2024
196f30c
enc/dec decoder test working, sans sampling check
afeldman-nm Jul 12, 2024
c5ceb23
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 13, 2024
9ce2da4
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 13, 2024
447a5c7
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 15, 2024
3d5bb88
EncoderDecoderModelInput correctly handles encoder token/position fields
afeldman-nm Jul 15, 2024
db5539a
format
afeldman-nm Jul 15, 2024
760355b
bart test skipped on CPU version of vllm
afeldman-nm Jul 15, 2024
590a240
Formatting
afeldman-nm Jul 15, 2024
8b8d981
refactored AttentionType and related imports; skip BART test definiti…
afeldman-nm Jul 15, 2024
ff940f7
formatting
afeldman-nm Jul 15, 2024
64d7198
wip
afeldman-nm Jul 15, 2024
0cca164
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 15, 2024
94c083c
Merge branch 'infra_enc_dec_model_runner_reviews' into infra_enc_dec_…
afeldman-nm Jul 15, 2024
83c5c43
prompt type checks
afeldman-nm Jul 15, 2024
10ed714
Format
afeldman-nm Jul 15, 2024
78d3d3c
modified LLM.generate() error message
afeldman-nm Jul 15, 2024
6c95380
wip engine is_encoder_decoder() setting
afeldman-nm Jul 15, 2024
304caed
formatting
afeldman-nm Jul 15, 2024
7b0803b
formatting?
afeldman-nm Jul 15, 2024
5525511
Sequence may be constructed with encoder/decoder LLMInput configurations
afeldman-nm Jul 15, 2024
dd4031c
wip but having wllm.commit_id error
afeldman-nm Jul 15, 2024
8dccaa5
correctly constructing enc/dec sequences
afeldman-nm Jul 15, 2024
336a77d
formatting
afeldman-nm Jul 15, 2024
46397c7
wip
afeldman-nm Jul 15, 2024
f85997b
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 15, 2024
251f899
wip
afeldman-nm Jul 15, 2024
9141347
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 15, 2024
ddaf0ad
wip
afeldman-nm Jul 16, 2024
54ff142
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 16, 2024
92d9f48
conftest: encoder/decoder example prompts
afeldman-nm Jul 16, 2024
c5846ac
Hfrunner greedy logprobs limit
afeldman-nm Jul 16, 2024
374880f
input preparation now includes encoder-oriented input setup:
afeldman-nm Jul 16, 2024
796d7a3
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 16, 2024
42ac66b
VllmRunner encoder/decoder methods
afeldman-nm Jul 16, 2024
850a97e
bart parallel vocab
afeldman-nm Jul 16, 2024
3c7e19d
zip enc/dec prompts; formatting
afeldman-nm Jul 16, 2024
e534ffc
wip
afeldman-nm Jul 16, 2024
97d81f0
encoder/decoder input processing; formatting
afeldman-nm Jul 16, 2024
87ed3b6
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 16, 2024
713d095
incorporated encoder sequence into request-add functionality
afeldman-nm Jul 16, 2024
aea8d34
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 17, 2024
159c7bc
fixed decoder-only bug
afeldman-nm Jul 17, 2024
16c9aa2
bugfix
afeldman-nm Jul 17, 2024
03aea18
wip
afeldman-nm Jul 17, 2024
ef80c85
wip
afeldman-nm Jul 17, 2024
f8dd4a5
fixed scheduler bug
afeldman-nm Jul 17, 2024
c2ff615
format
afeldman-nm Jul 17, 2024
31127fa
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 17, 2024
1c6e06d
bugfix
afeldman-nm Jul 17, 2024
0cc14ab
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 17, 2024
3656dc6
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 17, 2024
aee5f16
fixed sequence bug
afeldman-nm Jul 17, 2024
ef94623
added examples utils w/ context manager for backend override; applied…
afeldman-nm Jul 17, 2024
50ad5ff
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 17, 2024
b277180
formatting
afeldman-nm Jul 17, 2024
cac6283
added encoder/decoder example to examples test
afeldman-nm Jul 17, 2024
f54f276
wip refactoring
afeldman-nm Jul 17, 2024
597a07d
refactor
afeldman-nm Jul 17, 2024
9f5a02c
RequestOutput & SequenceGroup now include encoder prompt in output, a…
afeldman-nm Jul 17, 2024
94c904f
wip parallel bart but encountering GPU count issue
afeldman-nm Jul 17, 2024
9da8fb3
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 17, 2024
1f8c52f
tweaks to enc/dec example
afeldman-nm Jul 17, 2024
1808846
formatting
afeldman-nm Jul 17, 2024
f15eacf
wip
afeldman-nm Jul 17, 2024
6c940f8
modified HF behavior in BART test to be truly greedy
afeldman-nm Jul 17, 2024
949ac02
formatting
afeldman-nm Jul 17, 2024
88c058e
wip parallelizing BART
afeldman-nm Jul 17, 2024
31e335f
wip activation parallelization
afeldman-nm Jul 17, 2024
c092ed4
merged in upstream changes; left some formatting issues which I expec…
afeldman-nm Jul 17, 2024
d7bd617
Merge branch 'infra_enc_dec_model_runner' into infra_enc_dec_model_ru…
afeldman-nm Jul 17, 2024
69f0379
wip:
afeldman-nm Jul 17, 2024
9fdd047
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 17, 2024
584c01e
Merge branch 'infra_enc_dec_model_runner_reviews' into infra_enc_dec_…
afeldman-nm Jul 17, 2024
7ace684
Merge remote-tracking branch 'bert_deps/afeldman-nm/infra_enc_dec_mod…
laishzh Jul 18, 2024
41ccf0c
wip merge
afeldman-nm Jul 20, 2024
ffa99b2
additional merge
afeldman-nm Jul 20, 2024
a22f56c
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 22, 2024
c00e0a8
CommonMetadataBuilder sets block_tables constructor arg of metadata
afeldman-nm Jul 22, 2024
32967c1
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 22, 2024
a33b501
Merge branch 'infra_enc_dec_model_runner' into infra_enc_dec_model_ru…
afeldman-nm Jul 22, 2024
a16cabb
equalized some generation/sampling config settings between enc/dec HF…
afeldman-nm Jul 22, 2024
abbb427
Merge branch 'infra_enc_dec_model_runner' into infra_enc_dec_model_ru…
afeldman-nm Jul 22, 2024
00198a6
BART MLPs parallelized
afeldman-nm Jul 22, 2024
fb3227f
parallelized BART learned positional embedding
afeldman-nm Jul 22, 2024
e5bb9de
all attention layer output linears are parallelized
afeldman-nm Jul 22, 2024
74abe22
encoder attention & decoder self-attention parallelized
afeldman-nm Jul 22, 2024
9bbed43
parallelized LM head
afeldman-nm Jul 22, 2024
fdf71de
parallelized enc/dec cross-attention, using a slight hack
afeldman-nm Jul 22, 2024
3551b6b
fixed bug where underlying Attention was constructed using full head-…
afeldman-nm Jul 22, 2024
b174c7a
bart is parallelized, modulo an unfortunate hack for QKVParallelLinea…
afeldman-nm Jul 22, 2024
c43a6ed
commented out BART TP=4
afeldman-nm Jul 22, 2024
a408289
Merge remote-tracking branch 'bert_deps/afeldman-nm/infra_enc_dec_mod…
laishzh Jul 22, 2024
b90b6b6
upstream merge
afeldman-nm Jul 22, 2024
14831b0
Merge branch 'infra_enc_dec_model_runner_reviews' into infra_enc_dec_…
afeldman-nm Jul 22, 2024
427032a
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 22, 2024
c51a168
fixed bug in how conftest was handling HF encoder/decoder outputs; di…
afeldman-nm Jul 23, 2024
b01937f
set up None/empty str tests which are not passing
afeldman-nm Jul 23, 2024
48a742d
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 23, 2024
b283544
Merge branch 'infra_enc_dec_model_runner_correctness' into infra_enc_…
afeldman-nm Jul 23, 2024
059273f
wip
afeldman-nm Jul 23, 2024
229847b
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 23, 2024
7e7bbd9
deleted unnecessary dependency
afeldman-nm Jul 23, 2024
4a6e39e
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 24, 2024
aa01d71
empty-string decoder input is now handled for encoder/decoder
afeldman-nm Jul 24, 2024
0b29fd2
enc/dec handles empty str and None decoder prompts correctly
afeldman-nm Jul 24, 2024
dd784b5
typing fix
afeldman-nm Jul 24, 2024
61d2ad2
fixed bugs in handling non-text formats for individual prompts
afeldman-nm Jul 24, 2024
f36ffb5
example includes prompt zipper
afeldman-nm Jul 24, 2024
c493d40
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 24, 2024
be58d8a
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 24, 2024
02114bd
_free_seq_group() -> _free_seq_group_cross_attn_blocks()
afeldman-nm Jul 24, 2024
5a270ff
refactoring
afeldman-nm Jul 24, 2024
ed4a56b
formatting
afeldman-nm Jul 24, 2024
4b5b2cf
removed unnecessary argument reordering
afeldman-nm Jul 24, 2024
d82b273
enc/dec example comments'
afeldman-nm Jul 24, 2024
0af58ec
responses to feedback
afeldman-nm Jul 24, 2024
bed9bcd
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 25, 2024
47b4eb2
fixed bug caused by upstream refactoring
afeldman-nm Jul 25, 2024
393515e
formatting
afeldman-nm Jul 25, 2024
fb5a2bc
upstream merge
afeldman-nm Jul 25, 2024
c2cc010
Removed lora from enc/dec model runner
afeldman-nm Jul 25, 2024
175ea95
Merge branch 'main' into infra_enc_dec_model_runner_reviews
afeldman-nm Jul 25, 2024
3327e5b
removed lora & vision & mm code from enc/dec modelrunner
afeldman-nm Jul 25, 2024
47c5548
checked out examples/offline_inference.py from main
afeldman-nm Jul 25, 2024
1bb7ad9
updated RequestOutput docstring
afeldman-nm Jul 25, 2024
035d90d
updated RequestOutput docstring
afeldman-nm Jul 25, 2024
64685ac
Sequence docstring
afeldman-nm Jul 25, 2024
d1751db
removed flashinfer references from enc/dec modelrunner
afeldman-nm Jul 25, 2024
f0abcc2
format
afeldman-nm Jul 25, 2024
4bb7fc4
removed chunked prefill logic/docstring text from enc/dec modelrunner
afeldman-nm Jul 25, 2024
a936faa
removed prefix caching from enc/dec modelrunner
afeldman-nm Jul 25, 2024
59bf8c4
Merge remote-tracking branch 'bert_deps/afeldman-nm/infra_enc_dec_mod…
laishzh Jul 25, 2024
12a9869
Merge remote-tracking branch 'origin/main'
laishzh Aug 13, 2024
53c5148
(WIP)feat: EmbeddingModelRunner support encoder model
laishzh Aug 13, 2024
63fb7a5
WIP: bert embedding
laishzh Aug 13, 2024
37bcba0
feat: full pipeline
laishzh Aug 14, 2024
76b47fb
chore: recover
laishzh Aug 15, 2024
aca786e
feat: default bos_token_id of encoder model
laishzh Aug 15, 2024
682c455
feat: recover sequence
laishzh Aug 15, 2024
872e795
feat: embedding model forward
laishzh Aug 15, 2024
a0ad0df
chore: recover unchanged files
laishzh Aug 16, 2024
f215884
chore: recover
laishzh Aug 16, 2024
7657af3
feat: fix lint
laishzh Aug 16, 2024
91e23d8
feat: fix lint
laishzh Aug 16, 2024
0b3f55c
feat: fix lint
laishzh Aug 16, 2024
275f49d
feat: embedding model prompt
laishzh Aug 16, 2024
ce9a599
feat: bos_token_id
laishzh Aug 16, 2024
7e1196d
fix: fix hint
laishzh Aug 17, 2024
b99d783
feat: remove embedding block space manager
laishzh Aug 17, 2024
b76da51
feat: enc_dec_runner base
laishzh Aug 19, 2024
e15d0cc
Merge branch 'main' into main
laishzh Aug 19, 2024
8b107a2
feat: fix lint
laishzh Aug 19, 2024
bfd7ec9
feat: model input
laishzh Aug 19, 2024
6f006f5
chore: fix lint
laishzh Aug 19, 2024
37f698b
feat: move BertEmbeddingModel to the end of file
laishzh Aug 19, 2024
d098607
feat: remove embedding_model_block_manager.py
laishzh Aug 19, 2024
fc1f2b7
chore: fix lint
laishzh Aug 19, 2024
612cf1a
feat: modify test_embedding
laishzh Aug 27, 2024
7d0ecb9
Add support for Roberta embedding models
maxdebayser Aug 28, 2024
e351bfd
feat: bert embedding implemented, but still have some bugs with mistral,
laishzh Sep 8, 2024
3ff2d36
feat: some changes on test_embedding.py
laishzh Sep 9, 2024
776dcbd
Merge branch 'main' of https://github.com/vllm-project/vllm
laishzh Sep 9, 2024
0ea4da1
feat: fix lint
laishzh Sep 9, 2024
15be7fa
feat: fix lint
laishzh Sep 9, 2024
afd997b
Merge branch '5447' into roberta_embedding
maxdebayser Sep 23, 2024
464a90f
Merge branch 'main' into bert
maxdebayser Sep 23, 2024
30c875e
Merge branch 'bert' into roberta_embedding
maxdebayser Sep 23, 2024
2c8a5b9
Merge branch 'main' into bert
maxdebayser Sep 23, 2024
08f1781
add head size 32
maxdebayser Sep 23, 2024
3fbfdf4
Merge remote-tracking branch 'origin/main'
laishzh Sep 26, 2024
57bdd60
Merge branch 'upstream_main' into bert
maxdebayser Sep 26, 2024
a14b4e3
Merge branch 'bert' into roberta_embedding
maxdebayser Sep 26, 2024
107d9c2
Merge branch 'upstream_main' into bert
maxdebayser Oct 2, 2024
e7044a6
Merge branch 'bert' into roberta_embedding
maxdebayser Oct 2, 2024
352d8b2
Merge remote-tracking branch 'maxdebayser/bert'
laishzh Oct 6, 2024
04b0bc6
feat: revert embedding_block_manager
laishzh Oct 6, 2024
6440795
Merge branch 'origin/main'
laishzh Oct 7, 2024
80c1885
feat: update with origin/main
laishzh Oct 7, 2024
30b0f21
Merge branch 'upstream_main' into bert
maxdebayser Oct 8, 2024
5793373
Merge branch 'bert' into roberta_embedding
maxdebayser Oct 8, 2024
935c58d
add registry of encoder-only models
maxdebayser Oct 11, 2024
ddbae13
Merge branch 'upstream_main' into roberta_embedding
maxdebayser Oct 11, 2024
44a4c04
Merge branch 'upstream_main' into roberta_embedding
maxdebayser Oct 14, 2024
6 changes: 6 additions & 0 deletions csrc/attention/attention_kernels.cu
@@ -739,6 +739,9 @@ void paged_attention_v1_launcher(
// NOTE(woosuk): To reduce the compilation time, we only compile for the
// head sizes that we use in the model. However, we can easily extend this
// to support any head size which is a multiple of 16.
case 32:
LAUNCH_PAGED_ATTENTION_V1(32);
break;
case 64:
LAUNCH_PAGED_ATTENTION_V1(64);
break;
@@ -903,6 +906,9 @@ void paged_attention_v2_launcher(
// NOTE(woosuk): To reduce the compilation time, we only compile for the
// head sizes that we use in the model. However, we can easily extend this
// to support any head size which is a multiple of 16.
case 32:
LAUNCH_PAGED_ATTENTION_V2(32);
break;
case 64:
LAUNCH_PAGED_ATTENTION_V2(64);
break;
6 changes: 6 additions & 0 deletions csrc/cpu/attention.cpp
@@ -375,6 +375,9 @@ void paged_attention_v1_impl_launcher(
int* seq_lens_ptr = seq_lens.data_ptr<int>();

switch (head_size) {
case 32:
LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
break;
case 64:
LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
break;
@@ -692,6 +695,9 @@ void paged_attention_v2_impl_launcher(
int* seq_lens_ptr = seq_lens.data_ptr<int>();

switch (head_size) {
case 32:
LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
break;
case 64:
LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
break;
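These case 32 branches are needed because compact encoder checkpoints fall below the head size of 64 that the launchers previously compiled for. A minimal sketch of the arithmetic (the checkpoint named here is an illustrative assumption, not something this PR adds):

from transformers import AutoConfig

# Hypothetical example: a small BERT-family encoder such as
# BAAI/bge-small-en-v1.5 reports hidden_size=384 with 12 attention heads,
# giving a per-head dimension of 384 // 12 = 32, which the switches above
# must now be able to dispatch.
config = AutoConfig.from_pretrained("BAAI/bge-small-en-v1.5")
head_size = config.hidden_size // config.num_attention_heads
assert head_size == 32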
16 changes: 16 additions & 0 deletions examples/offline_inference_bert_embedding.py
@@ -0,0 +1,16 @@
from vllm import LLM

# Sample prompts.
prompts = [
"This is an example sentence.",
"Another example sentence.",
]

# Create an LLM.
model = LLM(model="bert-base-uncased", enforce_eager=True)
outputs = model.encode(prompts)

# Print the outputs.
for output in outputs:
print(output.outputs.embedding) # list of 768 floats
print(len(output.outputs.embedding))
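Once the vectors are materialized as tensors they can be compared directly; a short follow-on sketch using PyTorch cosine similarity (not part of the example file itself):

import torch
import torch.nn.functional as F

# Compare the embeddings of the two sample prompts above.
emb_a = torch.tensor(outputs[0].outputs.embedding)
emb_b = torch.tensor(outputs[1].outputs.embedding)
print(F.cosine_similarity(emb_a, emb_b, dim=0).item())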
13 changes: 10 additions & 3 deletions examples/offline_inference_embedding.py
@@ -1,15 +1,22 @@
from vllm import LLM
+from vllm.inputs import build_decoder_prompts

# Sample prompts.
-prompts = [
+prompts = build_decoder_prompts([
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
-]
+])

# Create an LLM.
-model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True)
+model = LLM(
+    model="intfloat/e5-mistral-7b-instruct",
+    enforce_eager=True,
+    # NOTE: sliding_window is not supported by encoder_decoder_model
+    disable_sliding_window=True,
+    gpu_memory_utilization=0.95,
+)
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.encode(prompts)
# Print the outputs.
36 changes: 30 additions & 6 deletions tests/models/embedding/language/test_embedding.py
@@ -6,9 +6,22 @@
import torch
import torch.nn.functional as F

+from vllm.inputs import build_decoder_prompts
+
MODELS = [
-    "intfloat/e5-mistral-7b-instruct",
-    "BAAI/bge-multilingual-gemma2",
+    {
+        "name": "intfloat/e5-mistral-7b-instruct",
+        "is_decoder_only": True
+    },
+    {
+        "name": "BAAI/bge-multilingual-gemma2",
+        "is_decoder_only": True
+    },
+    {
+        "name": "bert-base-uncased",
+        "is_decoder_only": False,
+        "max_model_len": 512
+    },
]


@@ -26,7 +39,7 @@ def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
-    model: str,
+    model: dict,
    dtype: str,
) -> None:
    # The example_prompts has ending "\n", for example:
@@ -37,11 +50,22 @@
    # So we need to strip the input texts to avoid test failing.
    example_prompts = [str(s).strip() for s in example_prompts]

-    with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
+    model_name = model["name"]
+    is_decoder_only = model["is_decoder_only"]
+    max_model_len = model.get("max_model_len", 1024)
+    with hf_runner(model_name, dtype=dtype,
+                   is_embedding_model=True) as hf_model:
        hf_outputs = hf_model.encode(example_prompts)

-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+    with vllm_runner(
+            model_name,
+            dtype=dtype,
+            disable_sliding_window=True,
+            max_model_len=max_model_len,
+    ) as vllm_model:
+        prompt_inputs = build_decoder_prompts(
+            example_prompts) if is_decoder_only else example_prompts
+        vllm_outputs = vllm_model.encode(prompt_inputs)

    similarities = compare_embeddings(hf_outputs, vllm_outputs)
    all_similarities = torch.stack(similarities)
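compare_embeddings is defined earlier in this test file, outside the hunk; a plausible sketch of what it computes, assuming one cosine-similarity score per HF/vLLM output pair:

def compare_embeddings(hf_outputs, vllm_outputs):
    # Values close to 1.0 mean the vLLM embedding agrees with the
    # HuggingFace reference for that prompt.
    return [
        F.cosine_similarity(torch.tensor(hf), torch.tensor(vllm), dim=0)
        for hf, vllm in zip(hf_outputs, vllm_outputs)
    ]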
2 changes: 1 addition & 1 deletion vllm/attention/ops/paged_attn.py
@@ -34,7 +34,7 @@ class PagedAttention:

@staticmethod
def get_supported_head_sizes() -> List[int]:
-        return [64, 80, 96, 112, 120, 128, 192, 256]
+        return [32, 64, 80, 96, 112, 120, 128, 192, 256]
Review comment from the PR author on this change:

TODO: It's strange that just adding another head size here makes the code run. Perhaps this is actually a silent failure and the actual kernel has to be added somewhere.


@staticmethod
def get_kv_cache_shape(
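On the TODO above: this list and the compiled kernels are maintained separately, so the registry can advertise a head size the kernels cannot actually serve. A hedged sanity-check sketch:

from vllm.attention.ops.paged_attn import PagedAttention

# The Python-side registry now advertises head size 32...
assert 32 in PagedAttention.get_supported_head_sizes()

# ...but this proves nothing about the kernels themselves: without the
# `case 32:` branches added above, the C++ switch would reach its
# unsupported-head-size default at runtime. Running a short encode with a
# head-size-32 model is the real end-to-end check.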
4 changes: 4 additions & 0 deletions vllm/config.py
@@ -566,6 +566,10 @@ def is_encoder_decoder_model(self) -> bool:
(hasattr(self.hf_config, "text_config") and getattr(
self.hf_config.text_config, "is_encoder_decoder", False)))

@property
def is_encoder_model(self) -> bool:
return ModelRegistry.is_encoder_model(self.hf_config.architectures)

@property
def is_embedding_model(self) -> bool:
"""Extract the embedding model flag."""
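A sketch of how the new property resolves, assuming ModelRegistry.is_encoder_model (added elsewhere in this PR) looks up hf_config.architectures in the registry of encoder-only models; the architecture names below are illustrative:

from vllm.model_executor.models import ModelRegistry

print(ModelRegistry.is_encoder_model(["BertEmbeddingModel"]))  # True
print(ModelRegistry.is_encoder_model(["LlamaForCausalLM"]))    # False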
7 changes: 7 additions & 0 deletions vllm/core/placeholder_block_space_manager.py
@@ -63,9 +63,16 @@ def free(self, seq: Sequence) -> None:
# No operation on free
return

def free_cross(self, seq: Sequence) -> None:
# No operation on free
return

def get_block_table(self, seq: Sequence) -> List[int]:
return None # type: ignore

def get_cross_block_table(self, seq: Sequence) -> List[int]:
return None # type: ignore

def get_num_free_gpu_blocks(self) -> int:
return 1

7 changes: 5 additions & 2 deletions vllm/inputs/__init__.py
@@ -1,7 +1,8 @@
from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt,
                   LLMInputs, PromptType, SingletonPrompt, TextPrompt,
-                   TokensPrompt, build_explicit_enc_dec_prompt,
-                   to_enc_dec_tuple_list, zip_enc_dec_prompts)
+                   TokensPrompt, build_decoder_prompt, build_decoder_prompts,
+                   build_explicit_enc_dec_prompt, to_enc_dec_tuple_list,
+                   zip_enc_dec_prompts)
from .registry import InputContext, InputRegistry

INPUT_REGISTRY = InputRegistry()
@@ -21,6 +22,8 @@
"ExplicitEncoderDecoderPrompt",
"LLMInputs",
"EncoderDecoderLLMInputs",
"build_decoder_prompt",
"build_decoder_prompts",
"build_explicit_enc_dec_prompt",
"to_enc_dec_tuple_list",
"zip_enc_dec_prompts",
12 changes: 12 additions & 0 deletions vllm/inputs/data.py
@@ -228,6 +228,18 @@ def to_enc_dec_tuple_list(
for enc_dec_prompt in enc_dec_prompts]


def build_decoder_prompt(
prompt: _T2, ) -> ExplicitEncoderDecoderPrompt[SingletonPrompt, _T2]:
return build_explicit_enc_dec_prompt(encoder_prompt="",
decoder_prompt=prompt)


def build_decoder_prompts(
prompts: Iterable[_T2],
) -> List[ExplicitEncoderDecoderPrompt[SingletonPrompt, _T2]]:
return [build_decoder_prompt(prompt) for prompt in prompts]


def __getattr__(name: str):
if name == "PromptInput":
import warnings
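These helpers wrap a plain prompt in an explicit encoder/decoder structure with an empty encoder prompt, so decoder-only embedding models can flow through the same encoder/decoder code path. A quick usage sketch:

from vllm.inputs import build_decoder_prompt

prompt = build_decoder_prompt("The capital of France is")
# ExplicitEncoderDecoderPrompt is a TypedDict, so the result is a dict:
print(prompt["decoder_prompt"])  # "The capital of France is"
print(prompt["encoder_prompt"])  # "" (empty encoder side)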
19 changes: 14 additions & 5 deletions vllm/inputs/preprocess.py
@@ -25,6 +25,7 @@
DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
Optional["MultiModalDataDict"],
Optional[Dict[str, Any]]]
_DEFAULT_BOS_TOKEN_ID = 1


class InputPreprocessor:
@@ -54,7 +55,13 @@ def get_bos_token_id(self,
                "is not initialized")
            return None

-        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
+        bos_token_id = self.tokenizer.get_lora_tokenizer(
+            lora_request).bos_token_id
+
+        if bos_token_id is None and self.model_config.is_encoder_model:
+            bos_token_id = _DEFAULT_BOS_TOKEN_ID
+
+        return bos_token_id

def get_eos_token_id(self,
lora_request: Optional[LoRARequest] = None
@@ -86,9 +93,10 @@ def get_decoder_start_token_id(self) -> Optional[int]:
dec_start_token_id = getattr(self.model_config.hf_config,
'decoder_start_token_id', None)
if dec_start_token_id is None:
-            print_warning_once("Falling back on <BOS> for decoder start token "
-                               "id because decoder start token id is not "
-                               "available.")
+            if not self.model_config.is_encoder_model:
+                logger.warning(
+                    "Falling back on <BOS> for decoder start token id "
+                    "because decoder start token id is not available.")
dec_start_token_id = self.get_bos_token_id()

return dec_start_token_id
@@ -577,4 +585,5 @@ async def preprocess_async(
        )

    def is_encoder_decoder_model(self):
-        return self.model_config.is_encoder_decoder_model
+        return self.model_config.is_encoder_decoder_model \
+            or self.model_config.is_encoder_model
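The BOS fallback matters because encoder-only tokenizers generally define no BOS token at all; a short illustration with HuggingFace tokenizers (the checkpoints are examples, and _DEFAULT_BOS_TOKEN_ID = 1 is a sentinel rather than a token these vocabularies reserve for BOS):

from transformers import AutoTokenizer

# BERT-style vocabularies use [CLS]/[SEP] rather than BOS/EOS, so
# bos_token_id is None and get_bos_token_id() would otherwise have
# nothing to seed the decoder sequence with.
print(AutoTokenizer.from_pretrained("bert-base-uncased").bos_token_id)  # None

# RoBERTa does define <s>, so no fallback is needed there.
print(AutoTokenizer.from_pretrained("roberta-base").bos_token_id)  # 0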
12 changes: 12 additions & 0 deletions vllm/model_executor/layers/pooler.py
@@ -12,6 +12,7 @@ class PoolingType(IntEnum):
"""Enumeration for different types of pooling methods."""
LAST = 0
ALL = 1
MEAN = 2


class Pooler(nn.Module):
@@ -50,6 +51,17 @@ def forward(
for prompt_len in prompt_lens:
pooled_data.append(hidden_states[offset:offset + prompt_len])
offset += prompt_len
elif self.pooling_type == PoolingType.MEAN:
# Calculate mean pooling
cumsum = torch.cumsum(hidden_states, dim=0)
start_indices = torch.cat([
torch.tensor([0], device=hidden_states.device),
torch.cumsum(prompt_lens[:-1], dim=0)
])
end_indices = torch.cumsum(prompt_lens, dim=0)
pooled_data = (
cumsum[end_indices - 1] - cumsum[start_indices] +
hidden_states[start_indices]) / prompt_lens.unsqueeze(1)
else:
raise ValueError(f"Invalid pooling type: {self.pooling_type}")

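The MEAN branch avoids a Python loop over prompts: for a prompt occupying rows [start, end) of the flattened hidden_states, its sum is cumsum[end - 1] - cumsum[start] + hidden_states[start], an inclusive-prefix-sum difference with the first row added back. A small self-contained check of that identity (the shapes and lengths are made up for illustration):

import torch

torch.manual_seed(0)
hidden_states = torch.randn(9, 4)      # 9 tokens total, hidden size 4
prompt_lens = torch.tensor([2, 3, 4])  # three prompts, flattened together

cumsum = torch.cumsum(hidden_states, dim=0)
start_indices = torch.cat([
    torch.tensor([0]),
    torch.cumsum(prompt_lens[:-1], dim=0),
])
end_indices = torch.cumsum(prompt_lens, dim=0)
pooled = (cumsum[end_indices - 1] - cumsum[start_indices] +
          hidden_states[start_indices]) / prompt_lens.unsqueeze(1)

# Reference: naive per-prompt means computed with a loop.
expected = torch.stack([
    hidden_states[s:s + n].mean(dim=0)
    for s, n in zip(start_indices.tolist(), prompt_lens.tolist())
])
assert torch.allclose(pooled, expected, atol=1e-6)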