[Core][Model] Add simple_model_runner and a new model XLMRobertaForSequenceClassification through multimodal interface #6260

Open
wants to merge 3 commits into base: main
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
@@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_xlmroberta.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

# online inference
docker exec cpu-test bash -c "
33 changes: 33 additions & 0 deletions examples/hf_bge.py
@@ -0,0 +1,33 @@
from typing import List, Tuple, Union

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name_or_path = "BAAI/bge-reranker-base"
cache_dir = None
max_length = 512

sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]] = \
[("hello world", "nice to meet you"), ("head north", "head south")]
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
cache_dir=cache_dir)
# XLMRobertaForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path,
cache_dir=cache_dir)
model = model.to("cuda")
model.eval()

inputs = tokenizer(
sentence_pairs,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to("cuda")

all_scores = []
with torch.no_grad():
logits = model(**inputs, return_dict=True).logits
scores = logits.view(-1, ).float()
all_scores.extend(scores.cpu().numpy().tolist())
print(all_scores)
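
The script above prints raw classification logits. For a cross-encoder reranker such as BAAI/bge-reranker-base, those logits are commonly squashed into [0, 1] relevance scores with a sigmoid; the snippet below is a minimal, optional post-processing sketch (not something this PR adds) that assumes the all_scores list produced above.

import torch

def normalize_scores(raw_scores):
    # Map raw reranker logits to [0, 1] relevance scores via a sigmoid.
    # Purely illustrative post-processing; the example itself only prints
    # the raw logits.
    return torch.sigmoid(torch.tensor(raw_scores, dtype=torch.float32)).tolist()

# e.g. normalize_scores(all_scores) -> one relevance score per sentence pair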
31 changes: 31 additions & 0 deletions examples/offline_inference_xlmroberta.py
@@ -0,0 +1,31 @@
from typing import List, Tuple, Union

from transformers import AutoTokenizer

from vllm import LLM

model = "BAAI/bge-reranker-base"
llm = LLM(model=model, tensor_parallel_size=1)

prompt = "this is a useless prompt."
sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]] = \
[("hello world", "nice to meet you"), ("head north", "head south")]
tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=None)

inputs = tokenizer(
sentence_pairs,
padding=True,
truncation=True,
return_tensors='pt',
max_length=512,
).to("cuda")
outputs = llm.process([{
"prompt": prompt,
"multi_modal_data": {
"xlmroberta": inputs,
}
}],
use_tqdm=False)

for output in outputs:
print(output.outputs.result)
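
For reference, output.outputs.result here is expected to carry the raw classification logits for the batch: tests/models/test_xlmroberta.py below asserts (via torch.allclose) that this tensor matches the logits produced by the Hugging Face example above.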
32 changes: 28 additions & 4 deletions tests/conftest.py
@@ -6,8 +6,8 @@
import tempfile
from collections import UserList
from enum import Enum
from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
TypeVar, Union)
from typing import (Any, Callable, Dict, List, Optional, Sequence, Tuple,
TypedDict, TypeVar, Union)

import pytest
import torch
@@ -25,8 +25,9 @@
from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, PromptInputs,
TextPrompt, to_enc_dec_tuple_list,
zip_enc_dec_prompts)
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sequence import SampleLogprobs
@@ -201,6 +202,7 @@ def __init__(
is_embedding_model: bool = False,
is_vision_model: bool = False,
is_encoder_decoder_model: bool = False,
is_simple_model: bool = False,
postprocess_inputs: Callable[[BatchEncoding],
BatchEncoding] = identity,
) -> None:
@@ -221,6 +223,9 @@ def __init__(
auto_cls = AutoModelForVision2Seq
elif is_encoder_decoder_model:
auto_cls = AutoModelForSeq2SeqLM
elif is_simple_model:
from transformers import AutoModelForSequenceClassification
auto_cls = AutoModelForSequenceClassification
else:
auto_cls = AutoModelForCausalLM

@@ -513,6 +518,17 @@ def generate_encoder_decoder_greedy_logprobs_limit(
def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
return self.model.encode(prompts)

def process(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
with torch.no_grad():
req_outputs = self.model(input_ids,
attention_mask,
return_dict=True)
return req_outputs

def __enter__(self):
return self

@@ -711,6 +727,14 @@ def encode(self, prompts: List[str]) -> List[List[float]]:
outputs.append(embedding)
return outputs

def process(
self,
prompts: Union[Union[PromptInputs, Sequence[PromptInputs]],
Optional[Union[str, List[str]]]] = None,
) -> torch.Tensor:
req_outputs = self.model.process(prompts)
return req_outputs

def __enter__(self):
return self

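
Note: the two process helpers added here mirror each other for the comparison test below. HfRunner.process runs the Hugging Face sequence-classification model directly under torch.no_grad(), while VllmRunner.process forwards the prompts to the LLM.process entry point exercised in examples/offline_inference_xlmroberta.py.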
3 changes: 2 additions & 1 deletion tests/entrypoints/openai/test_serving_chat.py
@@ -19,7 +19,8 @@ class MockModelConfig:
tokenizer_mode = "auto"
max_model_len = 100
tokenizer_revision = None
embedding_mode = False
# See vllm.model_executor.models.ModelMode
model_mode = False


@dataclass
65 changes: 65 additions & 0 deletions tests/models/test_xlmroberta.py
@@ -0,0 +1,65 @@
from typing import List, Optional, Tuple, Type, Union

import pytest
import torch
from transformers import AutoTokenizer

from ..conftest import HfRunner, VllmRunner

models = ["BAAI/bge-reranker-base"]


def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
model: str,
*,
dtype: str,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm."""

prompt = "this is a useless prompt."
sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]] = \
[("hello world", "nice to meet you"), ("head north", "head south")]
tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=None)
inputs = tokenizer(
sentence_pairs,
padding=True,
truncation=True,
return_tensors='pt',
max_length=512,
).to("cuda")

with vllm_runner(model,
dtype=dtype,
max_model_len=512,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.process([{
"prompt": prompt,
"multi_modal_data": {
"xlmroberta": inputs,
}
}])

with hf_runner(model, dtype=dtype, is_simple_model=True) as hf_model:
hf_outputs = hf_model.process(**inputs)

print(vllm_outputs[0].outputs.result, hf_outputs.logits.view(-1, ))
assert torch.allclose(vllm_outputs[0].outputs.result,
hf_outputs.logits.view(-1, ))


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float"])
def test_models(hf_runner, vllm_runner, model, dtype: str) -> None:
run_test(
hf_runner,
vllm_runner,
model,
dtype=dtype,
tensor_parallel_size=1,
)
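
This test moves its tokenized inputs to "cuda" and is excluded from the CPU CI run in .buildkite/run-cpu-test.sh above, so it needs a GPU. A minimal way to invoke just this test, sketched on the assumption that the usual vLLM test environment is already set up:

import pytest

# Run only the new XLM-RoBERTa reranker test, with verbose output and
# stdout capture disabled (mirroring the flags used in the CI script).
pytest.main(["-v", "-s", "tests/models/test_xlmroberta.py"])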
42 changes: 17 additions & 25 deletions vllm/config.py
@@ -10,7 +10,7 @@
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models import ModelMode, ModelRegistry
from vllm.platforms import current_platform
from vllm.tracing import is_otel_installed
from vllm.transformers_utils.config import get_config, get_hf_text_config
@@ -167,6 +167,8 @@ def __init__(
code_revision, rope_scaling, rope_theta)
self.hf_text_config = get_hf_text_config(self.hf_config)
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
architectures = getattr(self.hf_config, "architectures", [])
self.model_mode = ModelRegistry.get_model_mode(architectures)

# Choose a default enforce_eager value if the user did not specify
# a value (enforce_eager is None)
@@ -217,7 +219,6 @@ def __init__(
limit_mm_per_prompt)
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
self._verify_embedding_mode()
self._verify_quantization()
self._verify_cuda_graph()

@@ -244,11 +245,6 @@ def _verify_tokenizer_mode(self) -> None:
"either 'auto' or 'slow'.")
self.tokenizer_mode = tokenizer_mode

def _verify_embedding_mode(self) -> None:
architectures = getattr(self.hf_config, "architectures", [])
self.embedding_mode = any(
ModelRegistry.is_embedding_model(arch) for arch in architectures)

def _parse_quant_hf_config(self):
quant_cfg = getattr(self.hf_config, "quantization_config", None)
if quant_cfg is None:
@@ -496,16 +492,6 @@ def get_multimodal_config(self) -> "MultiModalConfig":

return self.multimodal_config

@property
def is_encoder_decoder_model(self) -> bool:
"""Extract the HF encoder/decoder model flag."""
return getattr(self.hf_config, "is_encoder_decoder", False)

@property
def is_embedding_model(self) -> bool:
"""Extract the embedding model flag."""
return self.embedding_mode


class CacheConfig:
"""Configuration for the KV cache.
@@ -860,7 +846,8 @@ class SchedulerConfig:
prompt latency) before scheduling next prompt.
enable_chunked_prefill: If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens.
embedding_mode: Whether the running model is for embedding.
model_mode: one of [DECODER, ENCODER, ENCODER_DECODER, EMBEDDING,
SIMPLE]
preemption_mode: Whether to perform preemption by swapping or
recomputation. If not specified, we determine the mode as follows:
We use recomputation by default since it incurs lower overhead than
@@ -882,7 +869,7 @@ def __init__(self,
num_lookahead_slots: int = 0,
delay_factor: float = 0.0,
enable_chunked_prefill: bool = False,
embedding_mode: Optional[bool] = False,
model_mode: ModelMode = ModelMode.DECODER,
preemption_mode: Optional[str] = None,
num_scheduler_steps: int = 1,
send_delta_data: bool = False) -> None:
@@ -893,14 +880,19 @@ def __init__(self,
# It is the values that have the best balance between ITL
# and TTFT on A100. Note it is not optimized for throughput.
self.max_num_batched_tokens = 512
elif embedding_mode:
# For embedding, choose specific value for higher throughput
self.max_num_batched_tokens = max(
max_model_len, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS)
else:
# If max_model_len is too short, use 2048 as the default value
# for higher throughput.
self.max_num_batched_tokens = max(max_model_len, 2048)
max_num_batched_tokens = max(max_model_len, 2048)
max_num_batched_tokens_for_mode = \
ModelMode.get_model_max_num_batched_tokens(model_mode)
if max_num_batched_tokens_for_mode is not None:
max_num_batched_tokens = max(
max_num_batched_tokens,
max_num_batched_tokens_for_mode)

self.max_num_batched_tokens = max_num_batched_tokens

if enable_chunked_prefill:
logger.info(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
Expand All @@ -912,7 +904,7 @@ def __init__(self,
self.num_lookahead_slots = num_lookahead_slots
self.delay_factor = delay_factor
self.chunked_prefill_enabled = enable_chunked_prefill
self.embedding_mode = embedding_mode
self.model_mode = model_mode
self.preemption_mode = preemption_mode
self.num_scheduler_steps = num_scheduler_steps
self.send_delta_data = send_delta_data
19 changes: 0 additions & 19 deletions vllm/core/interfaces.py
@@ -23,25 +23,6 @@ class AllocStatus(enum.Enum):

class BlockSpaceManager(ABC):

@staticmethod
def get_block_space_manager_class(version: str):
version = version.lower()

if version == "v1":
from vllm.core.block_manager_v1 import BlockSpaceManagerV1
return BlockSpaceManagerV1

if version == "v2":
from vllm.core.block_manager_v2 import BlockSpaceManagerV2
return BlockSpaceManagerV2

if version == "embedding":
from vllm.core.embedding_model_block_manager import (
EmbeddingModelBlockSpaceManager)
return EmbeddingModelBlockSpaceManager

raise ValueError(f"Unknown version {version=}")

@abstractmethod
def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
pass
15 changes: 5 additions & 10 deletions vllm/core/scheduler.py
@@ -6,8 +6,8 @@
from dataclasses import dataclass, field
from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union

from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.config import CacheConfig, LoRAConfig, ModelMode, SchedulerConfig
from vllm.core.interfaces import AllocStatus
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -307,14 +307,9 @@ def __init__(
# LoRAs. This should be improved in the future.
self.lora_config = lora_config

version = "v1"
if self.scheduler_config.use_v2_block_manager:
version = "v2"
if self.scheduler_config.embedding_mode:
version = "embedding"

BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
version)
BlockSpaceManagerImpl = ModelMode.get_block_space_manager_impl(
self.scheduler_config.use_v2_block_manager,
self.scheduler_config.model_mode)

num_gpu_blocks = cache_config.num_gpu_blocks
if num_gpu_blocks:
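
The config.py and scheduler.py hunks above call into a new ModelMode helper (imported from vllm.model_executor.models and re-exported through vllm.config) whose definition is not included in the hunks shown on this page. Purely as an illustrative sketch, and only an assumption about its shape, it could look roughly like the following; the enum members come from the SchedulerConfig docstring above, while the method bodies, the placeholder token budget, and the mapping of modes to block managers are guesses, not the PR's actual implementation.

import enum
from typing import Optional


class ModelMode(enum.Enum):
    # Hypothetical sketch of the mode enum this PR introduces.
    DECODER = enum.auto()
    ENCODER = enum.auto()
    ENCODER_DECODER = enum.auto()
    EMBEDDING = enum.auto()
    SIMPLE = enum.auto()

    @staticmethod
    def get_model_max_num_batched_tokens(mode: "ModelMode") -> Optional[int]:
        # Embedding-style models previously used a larger batched-token budget
        # (_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS); other modes return None so
        # SchedulerConfig falls back to max(max_model_len, 2048).
        if mode == ModelMode.EMBEDDING:
            return 32768  # placeholder value
        return None

    @staticmethod
    def get_block_space_manager_impl(use_v2_block_manager: bool,
                                     mode: "ModelMode"):
        # Models that do not generate tokens (embedding / simple classification)
        # need no KV-cache accounting; everything else keeps the v1/v2 managers,
        # matching the branches removed from vllm/core/interfaces.py above.
        if mode in (ModelMode.EMBEDDING, ModelMode.SIMPLE):
            from vllm.core.embedding_model_block_manager import (
                EmbeddingModelBlockSpaceManager)
            return EmbeddingModelBlockSpaceManager
        if use_v2_block_manager:
            from vllm.core.block_manager_v2 import BlockSpaceManagerV2
            return BlockSpaceManagerV2
        from vllm.core.block_manager_v1 import BlockSpaceManagerV1
        return BlockSpaceManagerV1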