
[Feature] vLLM CLI for serving and querying OpenAI compatible server #5090

Merged: 36 commits, Jul 14, 2024
Changes from 24 commits

Commits (36)
533dfa2
simple cli
simon-mo Apr 18, 2024
fa90277
Merge branch 'main' of github.com:vllm-project/vllm
simon-mo Apr 18, 2024
b6f06fa
fix sorting
simon-mo Apr 18, 2024
3c09138
change to positional
simon-mo Apr 18, 2024
01b0fef
fix isort
simon-mo Apr 18, 2024
8d13d0a
changed pos arg name
EthanqX May 25, 2024
e4004e9
started adding complete subparser
EthanqX May 25, 2024
d9606e4
draft complete cli endpoint
EthanqX May 27, 2024
60d58cb
finished complete cli endpoint
EthanqX May 28, 2024
dd031b5
added chat cli endpoint
EthanqX May 29, 2024
fdea667
small fixes
EthanqX May 30, 2024
1979d18
used openai sdk
EthanqX Jun 5, 2024
73ed451
small fix
EthanqX Jun 5, 2024
5aa70b6
adjusted imports
EthanqX Jun 5, 2024
0aff304
handled system prompt
EthanqX Jun 5, 2024
1e4e891
fixed url
EthanqX Jun 5, 2024
1c617b9
Merge branch 'main' of github.com:vllm-project/vllm into new-cli
simon-mo Jun 5, 2024
5c8250b
revert docs changes (shadow launching)
simon-mo Jun 5, 2024
09103b6
refactor code
simon-mo Jun 5, 2024
ae60142
format
simon-mo Jun 5, 2024
807d97f
revert format
simon-mo Jun 5, 2024
09aa92f
fix multiline
simon-mo Jun 5, 2024
f9dde03
Merge branch 'vllm-project:main' into new-cli
EthanqX Jun 5, 2024
00f84dd
removed buffer from complete
EthanqX Jun 6, 2024
6f60716
Merge branch 'main' of github.com:vllm-project/vllm into new-cli
simon-mo Jun 11, 2024
cbd8d8e
wrapper method for old docs
EthanqX Jun 11, 2024
310f473
Merge remote-tracking branch 'origin/main' into new-cli
EthanqX Jun 24, 2024
4913116
support reuse of llm engine to run server
EthanqX Jun 24, 2024
edef04f
arg parser for test utils
EthanqX Jun 26, 2024
9e19be7
Merge 'origin/main' into new-cli
EthanqX Jul 1, 2024
563ec6d
format
EthanqX Jul 1, 2024
824b5d9
format
EthanqX Jul 2, 2024
3dd1b75
delete check for model flag in serve
EthanqX Jul 12, 2024
e93d59a
Merge branch 'main' of github.com:vllm-project/vllm into new-cli
EthanqX Jul 12, 2024
53b6d1e
use FlexibleArgumentParser
EthanqX Jul 13, 2024
8cf2257
isort
EthanqX Jul 13, 2024
4 changes: 2 additions & 2 deletions benchmarks/benchmark_serving.py
@@ -2,8 +2,8 @@

On the server side, run one of the following commands:
vLLM OpenAI API server
python -m vllm.entrypoints.openai.api_server \
--model <your_model> --swap-space 16 \
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests

(TGI backend)
5 changes: 5 additions & 0 deletions setup.py
@@ -430,4 +430,9 @@ def _read_requirements(filename: str) -> List[str]:
},
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
package_data=package_data,
entry_points={
"console_scripts": [
"vllm=vllm.scripts:main",
],
},
)
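
The console_scripts entry is what makes the bare vllm command available after installation; it resolves to vllm.scripts:main. As a rough illustration (the model tag and --dtype flag below are example arguments, not taken from this PR), running vllm serve from a shell is approximately equivalent to:

# Rough equivalent of the installed `vllm` console script (illustrative only;
# the model tag and --dtype flag are example arguments).
import sys

from vllm.scripts import main

sys.argv = ["vllm", "serve", "facebook/opt-125m", "--dtype", "half"]
main()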
1 change: 0 additions & 1 deletion tests/entrypoints/test_openai_server.py
@@ -83,7 +83,6 @@ def zephyr_lora_files():
def server(zephyr_lora_files):
ray.init()
server_runner = ServerRunner.remote([
"--model",
MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
3 changes: 1 addition & 2 deletions tests/utils.py
@@ -24,8 +24,7 @@ def __init__(self, args):
env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
self.proc = subprocess.Popen(
[sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
args,
["vllm", "serve"] + args,
env=env,
stdout=sys.stdout,
stderr=sys.stderr,
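
With this change the test harness shells out to the installed vllm command instead of python -m vllm.entrypoints.openai.api_server. Outside the harness, the same launch-and-wait pattern can be sketched roughly as follows (a hypothetical example; the model, port, and timeout are illustrative):

# Minimal sketch (not vLLM test code): start the server via the new CLI and
# poll the /health endpoint until it responds.
import subprocess
import sys
import time
import urllib.request

proc = subprocess.Popen(["vllm", "serve", "facebook/opt-125m", "--dtype", "half"],
                        stdout=sys.stdout, stderr=sys.stderr)
for _ in range(120):  # wait up to roughly two minutes for startup
    try:
        with urllib.request.urlopen("http://localhost:8000/health", timeout=1):
            print("server is up")
            break
    except OSError:
        time.sleep(1)
# ... issue requests against http://localhost:8000/v1 here, then shut down:
proc.terminate()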
68 changes: 45 additions & 23 deletions vllm/entrypoints/openai/api_server.py
@@ -1,3 +1,4 @@
import argparse
import asyncio
import importlib
import inspect
@@ -8,7 +9,7 @@

import fastapi
import uvicorn
from fastapi import Request
from fastapi import APIRouter, Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -32,6 +33,9 @@

TIMEOUT_KEEP_ALIVE = 5 # seconds

logger = init_logger(__name__)
engine: AsyncLLMEngine
engine_args: AsyncEngineArgs
openai_serving_chat: OpenAIServingChat
openai_serving_completion: OpenAIServingCompletion
openai_serving_embedding: OpenAIServingEmbedding
@@ -57,47 +61,35 @@ async def _force_log():
yield


app = fastapi.FastAPI(lifespan=lifespan)


def parse_args():
parser = make_arg_parser()
return parser.parse_args()

router = APIRouter()

# Add prometheus asgi middleware to route /metrics requests
route = Mount("/metrics", make_asgi_app())
# Workaround for 307 Redirect for /metrics
route.path_regex = re.compile('^/metrics(?P<path>.*)$')
app.routes.append(route)

router.routes.append(route)

@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
err = openai_serving_chat.create_error_response(message=str(exc))
return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)


@app.get("/health")
@router.get("/health")
async def health() -> Response:
"""Health check."""
await openai_serving_chat.engine.check_health()
return Response(status_code=200)


@app.get("/v1/models")
@router.get("/v1/models")
async def show_available_models():
models = await openai_serving_chat.show_available_models()
return JSONResponse(content=models.model_dump())


@app.get("/version")
@router.get("/version")
async def show_version():
ver = {"version": vllm.__version__}
return JSONResponse(content=ver)


@app.post("/v1/chat/completions")
@router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
generator = await openai_serving_chat.create_chat_completion(
@@ -113,7 +105,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
return JSONResponse(content=generator.model_dump())


@app.post("/v1/completions")
@router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
generator = await openai_serving_completion.create_completion(
request, raw_request)
@@ -127,7 +119,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
return JSONResponse(content=generator.model_dump())


@app.post("/v1/embeddings")
@router.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
generator = await openai_serving_embedding.create_embedding(
request, raw_request)
@@ -138,8 +130,10 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
return JSONResponse(content=generator.model_dump())


if __name__ == "__main__":
args = parse_args()
def build_app(args):
app = fastapi.FastAPI(lifespan=lifespan)
app.include_router(router)
app.root_path = args.root_path

app.add_middleware(
CORSMiddleware,
@@ -149,6 +143,12 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
allow_headers=args.allowed_headers,
)

@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
err = openai_serving_chat.create_error_response(message=str(exc))
return JSONResponse(err.model_dump(),
status_code=HTTPStatus.BAD_REQUEST)

if token := envs.VLLM_API_KEY or args.api_key:

@app.middleware("http")
@@ -174,6 +174,12 @@ async def authentication(request: Request, call_next):
raise ValueError(f"Invalid middleware {middleware}. "
f"Must be a function or a class.")

return app


def run_server(args):
prashantgupta24 (Contributor) commented on Jun 12, 2024:

Is it possible to make engine an optional argument to this function?

Suggested change:
- def run_server(args):
+ def run_server(args, llm_engine=None):

This would let external applications reuse the LLM engine and attach other API interfaces (such as gRPC) to the same engine, together with the other suggestion of changing line 204 to:

engine = (llm_engine
              if llm_engine is not None else AsyncLLMEngine.from_engine_args(
                  engine_args, usage_context=UsageContext.OPENAI_API_SERVER))

Another Contributor replied: +1, this would be useful.
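
If that suggestion were adopted, the combined change might look roughly like the sketch below (hypothetical, assembled from the two snippets in the comment; it is not what this revision of the PR implements):

# Hypothetical sketch of the reviewer's suggestion, not part of this PR revision.
def run_server(args, llm_engine=None):
    app = build_app(args)
    # ... logging and served_model_names setup unchanged ...
    global engine, engine_args
    engine_args = AsyncEngineArgs.from_cli_args(args)
    # Reuse a caller-provided engine when given; otherwise build one from CLI args.
    engine = (llm_engine
              if llm_engine is not None else AsyncLLMEngine.from_engine_args(
                  engine_args, usage_context=UsageContext.OPENAI_API_SERVER))
    # ... serving objects and uvicorn.run(...) unchanged ...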

app = build_app(args)

logger.info("vLLM API server version %s", vllm.__version__)
logger.info("args: %s", args)

@@ -182,6 +188,8 @@ async def authentication(request: Request, call_next):
else:
served_model_names = [args.model]

global engine, engine_args

engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
@@ -200,6 +208,10 @@ async def authentication(request: Request, call_next):
# When using single vLLM without engine_use_ray
model_config = asyncio.run(engine.get_model_config())

global openai_serving_chat
global openai_serving_completion
global openai_serving_embedding

openai_serving_chat = OpenAIServingChat(engine, model_config,
served_model_names,
args.response_role,
Expand All @@ -219,3 +231,13 @@ async def authentication(request: Request, call_next):
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs)


if __name__ == "__main__":
# NOTE(simon):
# This section should be in sync with vllm/scripts.py for CLI entrypoints.
Collaborator: Is this note true? They seem to be different. (Also, in this case, should we have a common main method to share?)

Collaborator: They are in sync in their usage of make_arg_parser.

parser = argparse.ArgumentParser(
description="vLLM OpenAI-Compatible RESTful API server.")
parser = make_arg_parser(parser)
args = parser.parse_args()
run_server(args)
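
For reference, a minimal sketch of launching the server programmatically through the refactored entrypoints (the model name and port below are illustrative, not taken from the PR):

# Illustrative sketch: build the shared parser and call run_server() directly.
import argparse

from vllm.entrypoints.openai.api_server import run_server
from vllm.entrypoints.openai.cli_args import make_arg_parser

parser = make_arg_parser(
    argparse.ArgumentParser(
        description="vLLM OpenAI-Compatible RESTful API server."))
args = parser.parse_args(["--model", "facebook/opt-125m", "--port", "8001"])
run_server(args)  # blocks and serves on http://localhost:8001/v1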
5 changes: 2 additions & 3 deletions vllm/entrypoints/openai/cli_args.py
@@ -22,9 +22,8 @@ def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, lora_list)


def make_arg_parser():
parser = argparse.ArgumentParser(
description="vLLM OpenAI-Compatible RESTful API server.")
def make_arg_parser(
parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser.add_argument("--host",
type=nullable_str,
default=None,
157 changes: 157 additions & 0 deletions vllm/scripts.py
@@ -0,0 +1,157 @@
# The CLI entrypoint to vLLM.
import argparse
import os
import signal
import sys
from typing import Optional

from openai import OpenAI

from vllm.entrypoints.openai.api_server import run_server
from vllm.entrypoints.openai.cli_args import make_arg_parser


def register_signal_handlers():

def signal_handler(sig, frame):
sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTSTP, signal_handler)


def serve(args: argparse.Namespace) -> None:
# EngineArgs expects the model name to be passed as --model.
if args.model is not None and args.model == args.model_tag:
raise ValueError(
"The --model argument is not supported for the serve command. "
"Use positional argument [model_tag] instead.")
args.model = args.model_tag

run_server(args)


def interactive_cli(args: argparse.Namespace) -> None:
register_signal_handlers()

base_url = args.url
api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY")
openai_client = OpenAI(api_key=api_key, base_url=base_url)

if args.model_name:
model_name = args.model_name
else:
available_models = openai_client.models.list()
model_name = available_models.data[0].id

print(f"Using model: {model_name}")

if args.command == "complete":
complete(model_name, openai_client)
elif args.command == "chat":
chat(args.system_prompt, model_name, openai_client)


def complete(model_name: str, client: OpenAI) -> None:
Collaborator: QQ: do we support setting some sampling params?

Member: I think Sang's question is whether or not we should support setting sampling params through complete and chat; for now it uses all default parameters.

        completion = client.completions.create(model=model_name,
                                               prompt=input_prompt)

Contributor (Author): Indeed, it would require checking field by field whether any sampling params were provided to override the default values.
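
A hypothetical sketch of such a field-by-field override (the helper name and parameter names below are illustrative and not part of this PR):

# Hypothetical sketch (not part of this PR): forward only the sampling params
# that were explicitly provided, so the server-side defaults apply otherwise.
from typing import Any, Dict, Optional

from openai import OpenAI


def complete_once(client: OpenAI,
                  model_name: str,
                  prompt: str,
                  temperature: Optional[float] = None,
                  max_tokens: Optional[int] = None) -> str:
    overrides: Dict[str, Any] = {
        key: value
        for key, value in {
            "temperature": temperature,
            "max_tokens": max_tokens
        }.items() if value is not None
    }
    completion = client.completions.create(model=model_name,
                                           prompt=prompt,
                                           **overrides)
    return completion.choices[0].text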

print("Please enter prompt to complete:")
while True:
input_prompt = input("> ")

completion = client.completions.create(model=model_name,
prompt=input_prompt)
output = completion.choices[0].text
print(output)


def chat(system_prompt: Optional[str], model_name: str,
client: OpenAI) -> None:
conversation = []
if system_prompt is not None:
conversation.append({"role": "system", "content": system_prompt})

print("Please enter a message for the chat model:")
while True:
input_message = input("> ")
message = {"role": "user", "content": input_message}
conversation.append(message)

chat_completion = client.chat.completions.create(model=model_name,
messages=conversation)

response_message = chat_completion.choices[0].message
output = response_message.content

conversation.append(response_message)
print(output)


def _add_query_options(
parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser.add_argument(
"--url",
type=str,
default="http://localhost:8000/v1",
help="url of the running OpenAI-Compatible RESTful API server")
parser.add_argument(
"--model-name",
type=str,
default=None,
help=("The model name used in prompt completion, default to "
"the first model in list models API call."))
parser.add_argument(
"--api-key",
type=str,
default=None,
help=(
"API key for OpenAI services. If provided, this api key "
"will overwrite the api key obtained through environment variables."
))
return parser


def main():
parser = argparse.ArgumentParser(description="vLLM CLI")
subparsers = parser.add_subparsers(required=True)

serve_parser = subparsers.add_parser(
"serve",
help="Start the vLLM OpenAI Compatible API server",
usage="vllm serve <model_tag> [options]")
serve_parser.add_argument("model_tag",
type=str,
help="The model tag to serve")
serve_parser = make_arg_parser(serve_parser)
serve_parser.set_defaults(dispatch_function=serve)

complete_parser = subparsers.add_parser(
"complete",
help=("Generate text completions based on the given prompt "
"via the running API server"),
usage="vllm complete [options]")
_add_query_options(complete_parser)
complete_parser.set_defaults(dispatch_function=interactive_cli,
command="complete")

chat_parser = subparsers.add_parser(
"chat",
help="Generate chat completions via the running API server",
usage="vllm chat [options]")
_add_query_options(chat_parser)
chat_parser.add_argument(
"--system-prompt",
type=str,
default=None,
help=("The system prompt to be added to the chat template, "
"used for models that support system prompts."))
chat_parser.set_defaults(dispatch_function=interactive_cli, command="chat")
Comment on lines +123 to +143
Member: I would add some documentation on how to use these (can be in a separate PR) if we're planning to release this feature.


args = parser.parse_args()
# One of the sub commands should be executed.
if hasattr(args, "dispatch_function"):
args.dispatch_function(args)
else:
parser.print_help()


if __name__ == "__main__":
main()
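
For readers unfamiliar with the set_defaults(dispatch_function=...) pattern used in main() above, here is a minimal standalone illustration (not vLLM code; the demo subcommand and names are invented for the example):

# Standalone illustration of the dispatch_function pattern (not vLLM code).
import argparse


def greet(args: argparse.Namespace) -> None:
    print(f"hello, {args.name}")


parser = argparse.ArgumentParser(prog="demo")
subparsers = parser.add_subparsers(required=True)
greet_parser = subparsers.add_parser("greet")
greet_parser.add_argument("name")
greet_parser.set_defaults(dispatch_function=greet)

args = parser.parse_args(["greet", "world"])
args.dispatch_function(args)  # prints "hello, world"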