From 22c286a5187b8dab5a712eeeef64283a261b56b7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 23 May 2024 09:29:52 +0000 Subject: [PATCH 1/8] Move common code inside `ServerRunner` and implement auto port selection --- tests/async_engine/test_openapi_server_ray.py | 31 +++-- tests/entrypoints/test_openai_server.py | 105 ++++++++-------- tests/tensorizer_loader/test_tensorizer.py | 12 +- tests/utils.py | 119 +++++++++++++----- 4 files changed, 155 insertions(+), 112 deletions(-) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 7a8d4b3915617..8af9b96fafc8e 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -4,16 +4,22 @@ # and debugging. import ray -from ..utils import ServerRunner +from ..utils import VLLM_PATH, ServerRunner # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" @pytest.fixture(scope="module") -def server(): - ray.init() - server_runner = ServerRunner.remote([ +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(ray_ctx): + return ServerRunner([ "--model", MODEL_NAME, # use half precision for speed and memory savings in CI environment @@ -24,22 +30,15 @@ def server(): "--enforce-eager", "--engine-use-ray" ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() @pytest.fixture(scope="module") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client +def client(server): + return server.get_async_client() @pytest.mark.asyncio -async def test_check_models(server, client: openai.AsyncOpenAI): +async def test_check_models(client: openai.AsyncOpenAI): models = await client.models.list() models = models.data served_model = models[0] @@ -48,7 +47,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_single_completion(server, client: openai.AsyncOpenAI): +async def test_single_completion(client: openai.AsyncOpenAI): completion = await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, @@ -74,7 +73,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_single_chat_session(server, client: openai.AsyncOpenAI): +async def test_single_chat_session(client: openai.AsyncOpenAI): messages = [{ "role": "system", "content": "you are a helpful assistant" diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 2463ccde2bc8b..565d5fa2f316a 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -15,7 +15,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer -from ..utils import ServerRunner +from ..utils import VLLM_PATH, ServerRunner # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -80,9 +80,15 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def server(zephyr_lora_files): - ray.init() - server_runner = ServerRunner.remote([ +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(zephyr_lora_files, ray_ctx): + return ServerRunner([ "--model", MODEL_NAME, # use half precision for speed and memory savings in CI environment @@ -90,6 +96,8 @@ def server(zephyr_lora_files): 
"bfloat16", "--max-model-len", "8192", + "--gpu-memory-utilization", + "0.45", "--enforce-eager", "--gpu-memory-utilization", "0.75", @@ -105,16 +113,11 @@ def server(zephyr_lora_files): "--max-num-seqs", "128", ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() @pytest.fixture(scope="module") -def embedding_server(zephyr_lora_files): - ray.shutdown() - ray.init() - server_runner = ServerRunner.remote([ +def embedding_server(ray_ctx): + return ServerRunner([ "--model", EMBEDDING_MODEL_NAME, # use half precision for speed and memory savings in CI environment @@ -125,23 +128,24 @@ def embedding_server(zephyr_lora_files): "0.75", "--max-model-len", "8192", + "--gpu-memory-utilization", + "0.45", + "--enforce-eager", ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() @pytest.fixture(scope="module") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client +def client(server): + return server.get_async_client() @pytest.mark.asyncio -async def test_check_models(server, client: openai.AsyncOpenAI): +@pytest.fixture(scope="module") +def embedding_client(embedding_server): + return embedding_server.get_async_client() + + +async def test_check_models(client: openai.AsyncOpenAI): models = await client.models.list() models = models.data served_model = models[0] @@ -158,8 +162,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI): "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) -async def test_single_completion(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): completion = await client.completions.create(model=model_name, prompt="Hello, my name is", max_tokens=5, @@ -190,8 +193,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) -async def test_zero_logprobs(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs completion = await client.completions.create( model=MODEL_NAME, @@ -212,7 +214,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_single_chat_session(server, client: openai.AsyncOpenAI, +async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", @@ -253,8 +255,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_too_many_logprobs(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -313,7 +314,7 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_completion_streaming(server, client: openai.AsyncOpenAI, +async def test_completion_streaming(client: openai.AsyncOpenAI, model_name: str): prompt = "What is an LLM?" 
@@ -351,8 +352,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_chat_streaming(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -402,8 +402,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_batch_completions(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): # test simple list batch = await client.completions.create( model=model_name, @@ -451,7 +450,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_logits_bias(server, client: openai.AsyncOpenAI): +async def test_logits_bias(client: openai.AsyncOpenAI): prompt = "Hello, my name is" max_tokens = 5 tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) @@ -501,7 +500,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_json_completion(server, client: openai.AsyncOpenAI, +async def test_guided_json_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str): completion = await client.completions.create( model=MODEL_NAME, @@ -524,7 +523,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_json_chat(server, client: openai.AsyncOpenAI, +async def test_guided_json_chat(client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", @@ -571,7 +570,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, +async def test_guided_regex_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str): completion = await client.completions.create( model=MODEL_NAME, @@ -592,7 +591,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, +async def test_guided_regex_chat(client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", @@ -630,7 +629,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, +async def test_guided_choice_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str): completion = await client.completions.create( model=MODEL_NAME, @@ -650,7 +649,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, +async def test_guided_choice_chat(client: openai.AsyncOpenAI, guided_decoding_backend: str): 
messages = [{ "role": "system", @@ -689,7 +688,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, +async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, guided_decoding_backend: str): with pytest.raises(openai.BadRequestError): _ = await client.completions.create( @@ -725,7 +724,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, +async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, guided_decoding_backend: str): messages = [{ "role": "system", @@ -754,7 +753,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_response_format_json_object(server, client: openai.AsyncOpenAI): +async def test_response_format_json_object(client: openai.AsyncOpenAI): for _ in range(2): resp = await client.chat.completions.create( model=MODEL_NAME, @@ -772,7 +771,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_extra_fields(server, client: openai.AsyncOpenAI): +async def test_extra_fields(client: openai.AsyncOpenAI): with pytest.raises(BadRequestError) as exc_info: await client.chat.completions.create( model=MODEL_NAME, @@ -788,7 +787,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_complex_message_content(server, client: openai.AsyncOpenAI): +async def test_complex_message_content(client: openai.AsyncOpenAI): resp = await client.chat.completions.create( model=MODEL_NAME, messages=[{ @@ -808,7 +807,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_custom_role(server, client: openai.AsyncOpenAI): +async def test_custom_role(client: openai.AsyncOpenAI): # Not sure how the model handles custom roles so we just check that # both string and complex message content are handled in the same way @@ -839,7 +838,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_guided_grammar(server, client: openai.AsyncOpenAI): +async def test_guided_grammar(client: openai.AsyncOpenAI): simple_sql_grammar = """ start: select_statement @@ -879,7 +878,7 @@ async def test_guided_grammar(server, client: openai.AsyncOpenAI): "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) -async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, +async def test_echo_logprob_completion(client: openai.AsyncOpenAI, model_name: str): tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) # test using text and token IDs @@ -906,7 +905,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_long_seed(server, client: openai.AsyncOpenAI): +async def test_long_seed(client: openai.AsyncOpenAI): for seed in [ torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).max + 1 @@ -930,14 +929,14 @@ async def test_long_seed(server, client: openai.AsyncOpenAI): "model_name", [EMBEDDING_MODEL_NAME], ) -async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, +async def 
test_single_embedding(embedding_client: openai.AsyncOpenAI, model_name: str): input = [ "The chef prepared a delicious meal.", ] # test single embedding - embeddings = await client.embeddings.create( + embeddings = await embedding_client.embeddings.create( model=model_name, input=input, encoding_format="float", @@ -969,14 +968,14 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, "model_name", [EMBEDDING_MODEL_NAME], ) -async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, +async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, model_name: str): # test List[str] inputs = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." ] - embeddings = await client.embeddings.create( + embeddings = await embedding_client.embeddings.create( model=model_name, input=inputs, encoding_format="float", diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 1579d53a7fe29..5fe2cdd80e859 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -6,7 +6,6 @@ import openai import pytest -import ray import torch from vllm import SamplingParams @@ -206,18 +205,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): openai_args = [ "--model", model_ref, "--dtype", "float16", "--load-format", "tensorizer", "--model-loader-extra-config", - json.dumps(model_loader_extra_config), "--port", "8000" + json.dumps(model_loader_extra_config), ] - server = ServerRunner.remote(openai_args) - - assert ray.get(server.ready.remote()) + server = ServerRunner(openai_args) print("Server ready.") - client = openai.OpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) + client = server.get_client() completion = client.completions.create(model=model_ref, prompt="Hello, my name is", max_tokens=5, diff --git a/tests/utils.py b/tests/utils.py index 329842911e159..f871fca874006 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,56 +4,107 @@ import time import warnings from contextlib import contextmanager +from typing import List +import openai import ray import requests from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) +from vllm.entrypoints.openai.cli_args import make_arg_parser from vllm.utils import get_open_port # Path to root of repository so that utilities can be imported by ray workers VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) -@ray.remote(num_gpus=1) class ServerRunner: + DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds - def __init__(self, args): - env = os.environ.copy() - env["PYTHONUNBUFFERED"] = "1" - self.proc = subprocess.Popen( - ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args, - env=env, - stdout=sys.stdout, - stderr=sys.stderr, + @ray.remote(num_gpus=1) + class _RemoteRunner: + + def __init__(self, cli_args: List[str], *, wait_url: str, + wait_timeout: float) -> None: + env = os.environ.copy() + env["PYTHONUNBUFFERED"] = "1" + self.proc = subprocess.Popen( + ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ + + cli_args, + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + ) + + self._wait_for_server(url=wait_url, timeout=wait_timeout) + + def ready(self): + return True + + def _wait_for_server(self, *, url: str, timeout: float): + # run health check + start = 
time.time() + while True: + try: + if requests.get(url).status_code == 200: + break + except Exception as err: + if self.proc.poll() is not None: + raise RuntimeError( + "Server exited unexpectedly.") from err + + time.sleep(0.5) + if time.time() - start > timeout: + raise RuntimeError( + "Server failed to start in time.") from err + + def __del__(self): + if hasattr(self, "proc"): + self.proc.terminate() + + def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: + if auto_port: + if "-p" in cli_args or "--port" in cli_args: + raise ValueError("You have manually specified the port" + "when `auto_port=True`.") + + cli_args = cli_args + ["--port", str(get_open_port())] + + parser = make_arg_parser() + args = parser.parse_args(cli_args) + self.host = str(args.host or 'localhost') + self.port = int(args.port) + + self._runner = self._RemoteRunner.remote( + cli_args, + wait_url=self.url_for("health"), + wait_timeout=self.MAX_SERVER_START_WAIT_S) + + self._wait_until_ready() + + @property + def url_root(self) -> str: + return f"http://{self.host}:{self.port}" + + def url_for(self, *parts: str) -> str: + return self.url_root + "/" + "/".join(parts) + + def _wait_until_ready(self) -> None: + ray.get(self._runner.ready.remote()) + + def get_client(self): + return openai.OpenAI( + base_url=self.url_for("v1"), + api_key=self.DUMMY_API_KEY, + ) + + def get_async_client(self): + return openai.AsyncOpenAI( + base_url=self.url_for("v1"), + api_key=self.DUMMY_API_KEY, ) - self._wait_for_server() - - def ready(self): - return True - - def _wait_for_server(self): - # run health check - start = time.time() - while True: - try: - if requests.get( - "http://localhost:8000/health").status_code == 200: - break - except Exception as err: - if self.proc.poll() is not None: - raise RuntimeError("Server exited unexpectedly.") from err - - time.sleep(0.5) - if time.time() - start > self.MAX_SERVER_START_WAIT_S: - raise RuntimeError( - "Server failed to start in time.") from err - - def __del__(self): - if hasattr(self, "proc"): - self.proc.terminate() def init_test_distributed_environment( From 3b13e68c5af172c070f774c6680d8b3ee5c0f39b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 24 May 2024 02:03:40 +0000 Subject: [PATCH 2/8] Rename `ServerRunner -> RemoteOpenAIServer` --- tests/async_engine/test_openapi_server_ray.py | 4 ++-- tests/entrypoints/test_openai_server.py | 6 +++--- tests/tensorizer_loader/test_tensorizer.py | 4 ++-- tests/utils.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 8af9b96fafc8e..b1702a1604da3 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -4,7 +4,7 @@ # and debugging. 
import ray -from ..utils import VLLM_PATH, ServerRunner +from ..utils import VLLM_PATH, RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" @@ -19,7 +19,7 @@ def ray_ctx(): @pytest.fixture(scope="module") def server(ray_ctx): - return ServerRunner([ + return RemoteOpenAIServer([ "--model", MODEL_NAME, # use half precision for speed and memory savings in CI environment diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 565d5fa2f316a..942950c15a655 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -15,7 +15,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer -from ..utils import VLLM_PATH, ServerRunner +from ..utils import VLLM_PATH, RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -88,7 +88,7 @@ def ray_ctx(): @pytest.fixture(scope="module") def server(zephyr_lora_files, ray_ctx): - return ServerRunner([ + return RemoteOpenAIServer([ "--model", MODEL_NAME, # use half precision for speed and memory savings in CI environment @@ -117,7 +117,7 @@ def server(zephyr_lora_files, ray_ctx): @pytest.fixture(scope="module") def embedding_server(ray_ctx): - return ServerRunner([ + return RemoteOpenAIServer([ "--model", EMBEDDING_MODEL_NAME, # use half precision for speed and memory savings in CI environment diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 5fe2cdd80e859..5a60778bb1863 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -17,7 +17,7 @@ open_stream, serialize_vllm_model) -from ..utils import ServerRunner +from ..utils import RemoteOpenAIServer # yapf conflicts with isort for this docstring @@ -208,7 +208,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): json.dumps(model_loader_extra_config), ] - server = ServerRunner(openai_args) + server = RemoteOpenAIServer(openai_args) print("Server ready.") client = server.get_client() diff --git a/tests/utils.py b/tests/utils.py index f871fca874006..530a67749fc6e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -19,7 +19,7 @@ VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) -class ServerRunner: +class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds From 1cdc3bcbec833f716a32ba78e0038b1871978ef2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 11 Jun 2024 05:57:15 +0000 Subject: [PATCH 3/8] Move embedding tests into its own file --- tests/entrypoints/test_openai_embedding.py | 117 +++++++++++++++++++++ tests/entrypoints/test_openai_server.py | 102 ------------------ 2 files changed, 117 insertions(+), 102 deletions(-) create mode 100644 tests/entrypoints/test_openai_embedding.py diff --git a/tests/entrypoints/test_openai_embedding.py b/tests/entrypoints/test_openai_embedding.py new file mode 100644 index 0000000000000..b54b505a617c9 --- /dev/null +++ b/tests/entrypoints/test_openai_embedding.py @@ -0,0 +1,117 @@ +import openai +import pytest +import ray + +from ..utils import VLLM_PATH, RemoteOpenAIServer + +EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" + +pytestmark = pytest.mark.openai + + +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + 
+@pytest.fixture(scope="module") +def embedding_server(ray_ctx): + return RemoteOpenAIServer([ + "--model", + EMBEDDING_MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--gpu-memory-utilization", + "0.75", + "--max-model-len", + "8192", + "--gpu-memory-utilization", + "0.45", + "--enforce-eager", + ]) + + +@pytest.mark.asyncio +@pytest.fixture(scope="module") +def embedding_client(embedding_server): + return embedding_server.get_async_client() + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_single_embedding(embedding_client: openai.AsyncOpenAI, + model_name: str): + input_texts = [ + "The chef prepared a delicious meal.", + ] + + # test single embedding + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_texts, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 9 + assert embeddings.usage.total_tokens == 9 + + # test using token IDs + input_tokens = [1, 1, 1, 1, 1] + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_tokens, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 5 + assert embeddings.usage.total_tokens == 5 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, + model_name: str): + # test List[str] + input_texts = [ + "The cat sat on the mat.", "A feline was resting on a rug.", + "Stars twinkle brightly in the night sky." 
+ ] + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_texts, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) == 4096 + + # test List[List[int]] + input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], + [25, 32, 64, 77]] + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_tokens, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 4 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 17 + assert embeddings.usage.total_tokens == 17 diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index a363e377c77fd..e702f207d74d3 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -19,7 +19,6 @@ # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" # technically this needs Mistral-7B-v0.1 as base, but we're not testing # generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" @@ -115,36 +114,11 @@ def server(zephyr_lora_files, ray_ctx): ]) -@pytest.fixture(scope="module") -def embedding_server(ray_ctx): - return RemoteOpenAIServer([ - "--model", - EMBEDDING_MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--enforce-eager", - "--gpu-memory-utilization", - "0.75", - "--max-model-len", - "8192", - "--gpu-memory-utilization", - "0.45", - "--enforce-eager", - ]) - - @pytest.fixture(scope="module") def client(server): return server.get_async_client() -@pytest.mark.asyncio -@pytest.fixture(scope="module") -def embedding_client(embedding_server): - return embedding_server.get_async_client() - - async def test_check_models(client: openai.AsyncOpenAI): models = await client.models.list() models = models.data @@ -1393,81 +1367,5 @@ async def test_long_seed(client: openai.AsyncOpenAI): or "less_than_equal" in exc_info.value.message) -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): - input_texts = [ - "The chef prepared a delicious meal.", - ] - - # test single embedding - embeddings = await embedding_client.embeddings.create( - model=model_name, - input=input_texts, - encoding_format="float", - ) - assert embeddings.id is not None - assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 9 - assert embeddings.usage.total_tokens == 9 - - # test using token IDs - input_tokens = [1, 1, 1, 1, 1] - embeddings = await client.embeddings.create( - model=model_name, - input=input_tokens, - encoding_format="float", - ) - assert embeddings.id is not None - assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 5 - assert embeddings.usage.total_tokens == 5 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): - # test List[str] - input_texts = 
[ - "The cat sat on the mat.", "A feline was resting on a rug.", - "Stars twinkle brightly in the night sky." - ] - embeddings = await embedding_client.embeddings.create( - model=model_name, - input=input_texts, - encoding_format="float", - ) - assert embeddings.id is not None - assert len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) == 4096 - - # test List[List[int]] - input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], - [25, 32, 64, 77]] - embeddings = await client.embeddings.create( - model=model_name, - input=input_tokens, - encoding_format="float", - ) - assert embeddings.id is not None - assert len(embeddings.data) == 4 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 17 - assert embeddings.usage.total_tokens == 17 - - if __name__ == "__main__": pytest.main([__file__]) From 99054b1c9e392649728b7a3d9f131e1d03856b07 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 11 Jun 2024 05:59:07 +0000 Subject: [PATCH 4/8] Update vision tests --- tests/entrypoints/test_openai_vision.py | 33 ++++++++++++------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py index cc03b04e0b0e0..d9143ccddd22e 100644 --- a/tests/entrypoints/test_openai_vision.py +++ b/tests/entrypoints/test_openai_vision.py @@ -8,7 +8,7 @@ from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 -from ..utils import ServerRunner +from ..utils import VLLM_PATH, RemoteOpenAIServer MODEL_NAME = "llava-hf/llava-1.5-7b-hf" LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent / @@ -25,10 +25,16 @@ pytestmark = pytest.mark.openai +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + @pytest.fixture(scope="module") def server(): - ray.init() - server_runner = ServerRunner.remote([ + return RemoteOpenAIServer([ "--model", MODEL_NAME, "--dtype", @@ -47,18 +53,11 @@ def server(): "--chat-template", str(LLAVA_CHAT_TEMPLATE), ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() @pytest.fixture(scope="session") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client +def client(server): + return server.get_async_client() @pytest_asyncio.fixture(scope="session") @@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]: @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, +async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): messages = [{ "role": @@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_single_chat_session_image_base64encoded( - server, client: openai.AsyncOpenAI, model_name: str, image_url: str, + client: openai.AsyncOpenAI, model_name: str, image_url: str, base64_encoded_image: Dict[str, str]): messages = [{ @@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def 
test_chat_streaming_image(server, client: openai.AsyncOpenAI, +async def test_chat_streaming_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): messages = [{ "role": @@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_multi_image_input(server, client: openai.AsyncOpenAI, - model_name: str, image_url: str): +async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, + image_url: str): messages = [{ "role": From f8914b78dfb574e5991c2bc95e288ff4b3ac9f0d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 11 Jun 2024 06:01:33 +0000 Subject: [PATCH 5/8] Use `sys.executable` instead of `python3` alias --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 530a67749fc6e..cce00b773666c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -31,7 +31,7 @@ def __init__(self, cli_args: List[str], *, wait_url: str, env = os.environ.copy() env["PYTHONUNBUFFERED"] = "1" self.proc = subprocess.Popen( - ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ + [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] \ + cli_args, env=env, stdout=sys.stdout, From 3f25d11c4f1e2cd13b39be46f4c342e65b6b9acc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 11 Jun 2024 06:16:48 +0000 Subject: [PATCH 6/8] Simplify code --- tests/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index cce00b773666c..c84364d20fc63 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -31,8 +31,10 @@ def __init__(self, cli_args: List[str], *, wait_url: str, env = os.environ.copy() env["PYTHONUNBUFFERED"] = "1" self.proc = subprocess.Popen( - [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] \ - + cli_args, + [ + sys.executable, "-m", "vllm.entrypoints.openai.api_server", + *cli_args + ], env=env, stdout=sys.stdout, stderr=sys.stderr, From f74456e486bc478e737f6d08e47c84351602a786 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 11 Jun 2024 08:06:07 +0000 Subject: [PATCH 7/8] Remove unnecessary GPU constraint --- tests/entrypoints/test_openai_embedding.py | 4 ---- tests/entrypoints/test_openai_server.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/tests/entrypoints/test_openai_embedding.py b/tests/entrypoints/test_openai_embedding.py index b54b505a617c9..2496d2ac3e97d 100644 --- a/tests/entrypoints/test_openai_embedding.py +++ b/tests/entrypoints/test_openai_embedding.py @@ -25,12 +25,8 @@ def embedding_server(ray_ctx): "--dtype", "bfloat16", "--enforce-eager", - "--gpu-memory-utilization", - "0.75", "--max-model-len", "8192", - "--gpu-memory-utilization", - "0.45", "--enforce-eager", ]) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e702f207d74d3..2d7e3044d1841 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -95,11 +95,7 @@ def server(zephyr_lora_files, ray_ctx): "bfloat16", "--max-model-len", "8192", - "--gpu-memory-utilization", - "0.45", "--enforce-eager", - "--gpu-memory-utilization", - "0.75", # lora config below "--enable-lora", "--lora-modules", From 0369c752a32cda6401b0cd0656e148adde9e6da4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 11 Jun 2024 15:27:05 +0000 Subject: [PATCH 8/8] Fix wrong scope --- 
tests/entrypoints/test_openai_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py index d9143ccddd22e..03dc5d1161f0e 100644 --- a/tests/entrypoints/test_openai_vision.py +++ b/tests/entrypoints/test_openai_vision.py @@ -55,7 +55,7 @@ def server(): ]) -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def client(server): return server.get_async_client()
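
A minimal usage sketch of the helper after this series, assuming the fixture pattern established in the patches above (the model name and CLI flags below are illustrative only; the port is chosen automatically by RemoteOpenAIServer):

    import openai
    import pytest
    import ray

    from ..utils import VLLM_PATH, RemoteOpenAIServer

    MODEL_NAME = "facebook/opt-125m"  # any chat-capable model works here


    @pytest.fixture(scope="module")
    def ray_ctx():
        # Ship the repo to Ray workers so `tests.utils` is importable remotely.
        ray.init(runtime_env={"working_dir": VLLM_PATH})
        yield
        ray.shutdown()


    @pytest.fixture(scope="module")
    def server(ray_ctx):
        # auto_port=True (the default) picks a free port, so tests no longer
        # assume localhost:8000.
        return RemoteOpenAIServer(["--model", MODEL_NAME, "--dtype", "bfloat16"])


    @pytest.fixture(scope="module")
    def client(server):
        return server.get_async_client()


    @pytest.mark.asyncio
    async def test_check_models(client: openai.AsyncOpenAI):
        models = await client.models.list()
        assert models.data[0].id == MODEL_NAME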