[Core][Model] Add simple_model_runner and a new model XLMRobertaForSequenceClassification through multimodal interface #6260

Open
wants to merge 3 commits into base: main
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
@@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_xlmroberta.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

# online inference
docker exec cpu-test bash -c "
33 changes: 33 additions & 0 deletions examples/hf_bge.py
@@ -0,0 +1,33 @@
from typing import List, Tuple, Union

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name_or_path = "BAAI/bge-reranker-base"
cache_dir = None
max_length = 512

sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]] = \
[("hello world", "nice to meet you"), ("head north", "head south")]
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
cache_dir=cache_dir)
# XLMRobertaForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path,
cache_dir=cache_dir)
model = model.to("cuda")
model.eval()

inputs = tokenizer(
sentence_pairs,
padding=True,
truncation=True,
return_tensors='pt',
max_length=max_length,
).to("cuda")

all_scores = []
with torch.no_grad():
logits = model(**inputs, return_dict=True).logits
scores = logits.view(-1, ).float()
all_scores.extend(scores.cpu().numpy().tolist())
print(all_scores)
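
The script above prints raw classification logits. For a cross-encoder reranker such as BAAI/bge-reranker-base, those logits are commonly squashed into [0, 1] relevance scores with a sigmoid; the snippet below is a minimal, optional post-processing sketch (not something this PR adds) that assumes the all_scores list produced above.

import torch

def normalize_scores(raw_scores):
    # Map raw reranker logits to [0, 1] relevance scores via a sigmoid.
    # Purely illustrative post-processing; the example itself only prints
    # the raw logits.
    return torch.sigmoid(torch.tensor(raw_scores, dtype=torch.float32)).tolist()

# e.g. normalize_scores(all_scores) -> one relevance score per sentence pair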
31 changes: 31 additions & 0 deletions examples/offline_inference_xlmroberta.py
@@ -0,0 +1,31 @@
from typing import List, Tuple, Union

from transformers import AutoTokenizer

from vllm import LLM

model = "BAAI/bge-reranker-base"
llm = LLM(model=model, tensor_parallel_size=1)

prompt = "this is a useless prompt."
sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]] = \
[("hello world", "nice to meet you"), ("head north", "head south")]
tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=None)

inputs = tokenizer(
sentence_pairs,
padding=True,
truncation=True,
return_tensors='pt',
max_length=512,
).to("cuda")
outputs = llm.process([{
"prompt": prompt,
"multi_modal_data": {
"xlmroberta": inputs,
}
}],
use_tqdm=False)

for output in outputs:
print(output.outputs.result)
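
For reference, output.outputs.result here is expected to carry the raw classification logits for the batch: tests/models/test_xlmroberta.py below asserts (via torch.allclose) that this tensor matches the logits produced by the Hugging Face example above.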
32 changes: 28 additions & 4 deletions tests/conftest.py
@@ -6,8 +6,8 @@
import tempfile
from collections import UserList
from enum import Enum
from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
TypeVar, Union)
from typing import (Any, Callable, Dict, List, Optional, Sequence, Tuple,
TypedDict, TypeVar, Union)

import pytest
import torch
@@ -25,8 +25,9 @@
from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, PromptInputs,
TextPrompt, to_enc_dec_tuple_list,
zip_enc_dec_prompts)
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sequence import SampleLogprobs
@@ -201,6 +202,7 @@ def __init__(
is_embedding_model: bool = False,
is_vision_model: bool = False,
is_encoder_decoder_model: bool = False,
is_simple_model: bool = False,
postprocess_inputs: Callable[[BatchEncoding],
BatchEncoding] = identity,
) -> None:
@@ -221,6 +223,9 @@ def __init__(
auto_cls = AutoModelForVision2Seq
elif is_encoder_decoder_model:
auto_cls = AutoModelForSeq2SeqLM
elif is_simple_model:
from transformers import AutoModelForSequenceClassification
auto_cls = AutoModelForSequenceClassification
else:
auto_cls = AutoModelForCausalLM

@@ -513,6 +518,17 @@ def generate_encoder_decoder_greedy_logprobs_limit(
def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
return self.model.encode(prompts)

def process(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
with torch.no_grad():
req_outputs = self.model(input_ids,
attention_mask,
return_dict=True)
return req_outputs

def __enter__(self):
return self

@@ -711,6 +727,14 @@ def encode(self, prompts: List[str]) -> List[List[float]]:
outputs.append(embedding)
return outputs

def process(
self,
prompts: Union[Union[PromptInputs, Sequence[PromptInputs]],
Optional[Union[str, List[str]]]] = None,
) -> torch.Tensor:
req_outputs = self.model.process(prompts)
return req_outputs

def __enter__(self):
return self

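
Note: the two process helpers added here mirror each other for the comparison test below. HfRunner.process runs the Hugging Face sequence-classification model directly under torch.no_grad(), while VllmRunner.process forwards the prompts to the LLM.process entry point exercised in examples/offline_inference_xlmroberta.py.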
3 changes: 2 additions & 1 deletion tests/entrypoints/openai/test_serving_chat.py
@@ -19,7 +19,8 @@ class MockModelConfig:
tokenizer_mode = "auto"
max_model_len = 100
tokenizer_revision = None
embedding_mode = False
# See vllm.model_executor.models.ModelMode
model_mode = False


@dataclass
65 changes: 65 additions & 0 deletions tests/models/test_xlmroberta.py
@@ -0,0 +1,65 @@
from typing import List, Optional, Tuple, Type, Union

import pytest
import torch
from transformers import AutoTokenizer

from ..conftest import HfRunner, VllmRunner

models = ["BAAI/bge-reranker-base"]


def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
model: str,
*,
dtype: str,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm."""

prompt = "this is a useless prompt."
sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]] = \
[("hello world", "nice to meet you"), ("head north", "head south")]
tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=None)
inputs = tokenizer(
sentence_pairs,
padding=True,
truncation=True,
return_tensors='pt',
max_length=512,
).to("cuda")

with vllm_runner(model,
dtype=dtype,
max_model_len=512,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.process([{
"prompt": prompt,
"multi_modal_data": {
"xlmroberta": inputs,
}
}])

with hf_runner(model, dtype=dtype, is_simple_model=True) as hf_model:
hf_outputs = hf_model.process(**inputs)

print(vllm_outputs[0].outputs.result, hf_outputs.logits.view(-1, ))
assert torch.allclose(vllm_outputs[0].outputs.result,
hf_outputs.logits.view(-1, ))


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float"])
def test_models(hf_runner, vllm_runner, model, dtype: str) -> None:
run_test(
hf_runner,
vllm_runner,
model,
dtype=dtype,
tensor_parallel_size=1,
)
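
This test moves its tokenized inputs to "cuda" and is excluded from the CPU CI run in .buildkite/run-cpu-test.sh above, so it needs a GPU. A minimal way to invoke just this test, sketched on the assumption that the usual vLLM test environment is already set up:

import pytest

# Run only the new XLM-RoBERTa reranker test, with verbose output and
# stdout capture disabled (mirroring the flags used in the CI script).
pytest.main(["-v", "-s", "tests/models/test_xlmroberta.py"])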
42 changes: 17 additions & 25 deletions vllm/config.py
@@ -10,7 +10,7 @@
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models import ModelMode, ModelRegistry
from vllm.platforms import current_platform
from vllm.tracing import is_otel_installed
from vllm.transformers_utils.config import get_config, get_hf_text_config
@@ -167,6 +167,8 @@ def __init__(
code_revision, rope_scaling, rope_theta)
self.hf_text_config = get_hf_text_config(self.hf_config)
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
architectures = getattr(self.hf_config, "architectures", [])
self.model_mode = ModelRegistry.get_model_mode(architectures)

# Choose a default enforce_eager value if the user did not specify
# a value (enforce_eager is None)
@@ -217,7 +219,6 @@ def __init__(
limit_mm_per_prompt)
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
self._verify_embedding_mode()
self._verify_quantization()
self._verify_cuda_graph()

@@ -244,11 +245,6 @@ def _verify_tokenizer_mode(self) -> None:
"either 'auto' or 'slow'.")
self.tokenizer_mode = tokenizer_mode

def _verify_embedding_mode(self) -> None:
architectures = getattr(self.hf_config, "architectures", [])
self.embedding_mode = any(
ModelRegistry.is_embedding_model(arch) for arch in architectures)

def _parse_quant_hf_config(self):
quant_cfg = getattr(self.hf_config, "quantization_config", None)
if quant_cfg is None:
@@ -496,16 +492,6 @@ def get_multimodal_config(self) -> "MultiModalConfig":

return self.multimodal_config

@property
def is_encoder_decoder_model(self) -> bool:
"""Extract the HF encoder/decoder model flag."""
return getattr(self.hf_config, "is_encoder_decoder", False)

@property
def is_embedding_model(self) -> bool:
"""Extract the embedding model flag."""
return self.embedding_mode


class CacheConfig:
"""Configuration for the KV cache.
@@ -860,7 +846,8 @@ class SchedulerConfig:
prompt latency) before scheduling next prompt.
enable_chunked_prefill: If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens.
embedding_mode: Whether the running model is for embedding.
model_mode: one of [DECODER, ENCODER, ENCODER_DECODER, EMBEDDING,
SIMPLE]
preemption_mode: Whether to perform preemption by swapping or
recomputation. If not specified, we determine the mode as follows:
We use recomputation by default since it incurs lower overhead than
@@ -882,7 +869,7 @@ def __init__(self,
num_lookahead_slots: int = 0,
delay_factor: float = 0.0,
enable_chunked_prefill: bool = False,
embedding_mode: Optional[bool] = False,
model_mode: ModelMode = ModelMode.DECODER,
preemption_mode: Optional[str] = None,
num_scheduler_steps: int = 1,
send_delta_data: bool = False) -> None:
@@ -893,14 +880,19 @@ def __init__(self,
# It is the values that have the best balance between ITL
# and TTFT on A100. Note it is not optimized for throughput.
self.max_num_batched_tokens = 512
elif embedding_mode:
# For embedding, choose specific value for higher throughput
self.max_num_batched_tokens = max(
max_model_len, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS)
else:
# If max_model_len is too short, use 2048 as the default value
# for higher throughput.
self.max_num_batched_tokens = max(max_model_len, 2048)
max_num_batched_tokens = max(max_model_len, 2048)
max_num_batched_tokens_for_mode = \
ModelMode.get_model_max_num_batched_tokens(model_mode)
if max_num_batched_tokens_for_mode is not None:
max_num_batched_tokens = max(
max_num_batched_tokens,
max_num_batched_tokens_for_mode)

self.max_num_batched_tokens = max_num_batched_tokens

if enable_chunked_prefill:
logger.info(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
Expand All @@ -912,7 +904,7 @@ def __init__(self,
self.num_lookahead_slots = num_lookahead_slots
self.delay_factor = delay_factor
self.chunked_prefill_enabled = enable_chunked_prefill
self.embedding_mode = embedding_mode
self.model_mode = model_mode
self.preemption_mode = preemption_mode
self.num_scheduler_steps = num_scheduler_steps
self.send_delta_data = send_delta_data
19 changes: 0 additions & 19 deletions vllm/core/interfaces.py
@@ -23,25 +23,6 @@ class AllocStatus(enum.Enum):

class BlockSpaceManager(ABC):

@staticmethod
def get_block_space_manager_class(version: str):
version = version.lower()

if version == "v1":
from vllm.core.block_manager_v1 import BlockSpaceManagerV1
return BlockSpaceManagerV1

if version == "v2":
from vllm.core.block_manager_v2 import BlockSpaceManagerV2
return BlockSpaceManagerV2

if version == "embedding":
from vllm.core.embedding_model_block_manager import (
EmbeddingModelBlockSpaceManager)
return EmbeddingModelBlockSpaceManager

raise ValueError(f"Unknown version {version=}")

@abstractmethod
def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
pass
15 changes: 5 additions & 10 deletions vllm/core/scheduler.py
@@ -6,8 +6,8 @@
from dataclasses import dataclass, field
from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union

from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.config import CacheConfig, LoRAConfig, ModelMode, SchedulerConfig
from vllm.core.interfaces import AllocStatus
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -307,14 +307,9 @@ def __init__(
# LoRAs. This should be improved in the future.
self.lora_config = lora_config

version = "v1"
if self.scheduler_config.use_v2_block_manager:
version = "v2"
if self.scheduler_config.embedding_mode:
version = "embedding"

BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
version)
BlockSpaceManagerImpl = ModelMode.get_block_space_manager_impl(
self.scheduler_config.use_v2_block_manager,
self.scheduler_config.model_mode)

num_gpu_blocks = cache_config.num_gpu_blocks
if num_gpu_blocks:
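
The config.py and scheduler.py hunks above call into a new ModelMode helper (imported from vllm.model_executor.models and re-exported through vllm.config) whose definition is not included in the hunks shown on this page. Purely as an illustrative sketch, and only an assumption about its shape, it could look roughly like the following; the enum members come from the SchedulerConfig docstring above, while the method bodies, the placeholder token budget, and the mapping of modes to block managers are guesses, not the PR's actual implementation.

import enum
from typing import Optional


class ModelMode(enum.Enum):
    # Hypothetical sketch of the mode enum this PR introduces.
    DECODER = enum.auto()
    ENCODER = enum.auto()
    ENCODER_DECODER = enum.auto()
    EMBEDDING = enum.auto()
    SIMPLE = enum.auto()

    @staticmethod
    def get_model_max_num_batched_tokens(mode: "ModelMode") -> Optional[int]:
        # Embedding-style models previously used a larger batched-token budget
        # (_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS); other modes return None so
        # SchedulerConfig falls back to max(max_model_len, 2048).
        if mode == ModelMode.EMBEDDING:
            return 32768  # placeholder value
        return None

    @staticmethod
    def get_block_space_manager_impl(use_v2_block_manager: bool,
                                     mode: "ModelMode"):
        # Models that do not generate tokens (embedding / simple classification)
        # need no KV-cache accounting; everything else keeps the v1/v2 managers,
        # matching the branches removed from vllm/core/interfaces.py above.
        if mode in (ModelMode.EMBEDDING, ModelMode.SIMPLE):
            from vllm.core.embedding_model_block_manager import (
                EmbeddingModelBlockSpaceManager)
            return EmbeddingModelBlockSpaceManager
        if use_v2_block_manager:
            from vllm.core.block_manager_v2 import BlockSpaceManagerV2
            return BlockSpaceManagerV2
        from vllm.core.block_manager_v1 import BlockSpaceManagerV1
        return BlockSpaceManagerV1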