diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index d8704d5e2496..187e6e563da6 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -15,7 +15,7 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 from starlette.routing import Mount
-from typing_extensions import assert_never
+from typing_extensions import assert_never, Annotated
 
 import vllm.envs as envs
 from vllm.config import ModelConfig
@@ -247,7 +247,7 @@ async def health() -> Response:
 
 
 @router.post("/tokenize")
-async def tokenize(request: TokenizeRequest):
+async def tokenize(request: Annotated[dict, TokenizeRequest]):
     generator = await openai_serving_tokenization.create_tokenize(request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
@@ -259,7 +259,7 @@ async def tokenize(request: TokenizeRequest):
 
 
 @router.post("/detokenize")
-async def detokenize(request: DetokenizeRequest):
+async def detokenize(request: Annotated[dict, DetokenizeRequest]):
     generator = await openai_serving_tokenization.create_detokenize(request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
@@ -283,7 +283,7 @@ async def show_version():
 
 
 @router.post("/v1/chat/completions")
-async def create_chat_completion(request: ChatCompletionRequest,
+async def create_chat_completion(request: Annotated[dict, ChatCompletionRequest],
                                  raw_request: Request):
 
     generator = await openai_serving_chat.create_chat_completion(
@@ -300,7 +300,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
 
 
 @router.post("/v1/completions")
-async def create_completion(request: CompletionRequest, raw_request: Request):
+async def create_completion(request: Annotated[dict, CompletionRequest], raw_request: Request):
     generator = await openai_serving_completion.create_completion(
         request, raw_request)
     if isinstance(generator, ErrorResponse):
@@ -313,7 +313,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
 
 
 @router.post("/v1/embeddings")
-async def create_embedding(request: EmbeddingRequest, raw_request: Request):
+async def create_embedding(request: Annotated[dict, EmbeddingRequest], raw_request: Request):
     generator = await openai_serving_embedding.create_embedding(
         request, raw_request)
     if isinstance(generator, ErrorResponse):
@@ -351,7 +351,7 @@ async def stop_profile():
             "This should ONLY be used for local development!")
 
     @router.post("/v1/load_lora_adapter")
-    async def load_lora_adapter(request: LoadLoraAdapterRequest):
+    async def load_lora_adapter(request: Annotated[dict, LoadLoraAdapterRequest]):
         response = await openai_serving_chat.load_lora_adapter(request)
         if isinstance(response, ErrorResponse):
             return JSONResponse(content=response.model_dump(),
@@ -365,7 +365,7 @@ async def load_lora_adapter(request: LoadLoraAdapterRequest):
         return Response(status_code=200, content=response)
 
     @router.post("/v1/unload_lora_adapter")
-    async def unload_lora_adapter(request: UnloadLoraAdapterRequest):
+    async def unload_lora_adapter(request: Annotated[dict, UnloadLoraAdapterRequest]):
         response = await openai_serving_chat.unload_lora_adapter(request)
         if isinstance(response, ErrorResponse):
             return JSONResponse(content=response.model_dump(),
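
The routes and payloads themselves are unchanged by this diff; only the type annotation FastAPI sees on the body parameter is swapped from the bare Pydantic model to Annotated[dict, ...]. As a minimal sketch of exercising one of the touched handlers, the snippet below POSTs to the /tokenize route; it assumes a vLLM OpenAI-compatible server is already listening locally, and the base URL, model name, and prompt are illustrative placeholders rather than values taken from this patch.

# Minimal client-side sketch: POST to the /tokenize route whose signature the diff changes.
# Assumes a server at http://localhost:8000; "my-model" and the prompt are placeholders.
import json
from urllib import request as urllib_request

payload = {
    "model": "my-model",        # placeholder model name
    "prompt": "Hello, world!",  # text to tokenize
}

req = urllib_request.Request(
    "http://localhost:8000/tokenize",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

with urllib_request.urlopen(req) as resp:
    # The handler returns JSON; print it verbatim rather than assuming specific fields.
    print(resp.read().decode("utf-8"))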