diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 4139eca9c1832..8e81d89560377 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -784,6 +784,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             },
             default=None,
             help="override or set neuron device configuration.")
+        parser.add_argument(
+            "--disable-fastapi-docs",
+            action='store_true',
+            default=False,
+            help="Disable OpenAPI schema, Swagger UI, and ReDoc documentation")
 
         return parser
 
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 6127177b4d889..88f103b4cbc06 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -11,7 +11,7 @@
 from argparse import Namespace
 from typing import Any, AsyncGenerator, Optional
 
-from fastapi import FastAPI, Request
+from fastapi import APIRouter, FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -27,17 +27,17 @@
 logger = init_logger("vllm.entrypoints.api_server")
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
-app = FastAPI()
+router = APIRouter()
 engine = None
 
 
-@app.get("/health")
+@router.get("/health")
 async def health() -> Response:
     """Health check."""
     return Response(status_code=200)
 
 
-@app.post("/generate")
+@router.post("/generate")
 async def generate(request: Request) -> Response:
     """Generate completion for the request.
 
@@ -88,8 +88,11 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
 
 
 def build_app(args: Namespace) -> FastAPI:
-    global app
-
+    if args.disable_fastapi_docs:
+        app = FastAPI(openapi_url=None, docs_url=None, redoc_url=None)
+    else:
+        app = FastAPI()
+    app.include_router(router)
     app.root_path = args.root_path
     return app
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 3d1d832986c1e..b891debfd2b91 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -417,7 +417,13 @@ async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
 
 
 def build_app(args: Namespace) -> FastAPI:
-    app = FastAPI(lifespan=lifespan)
+    if args.disable_fastapi_docs:
+        app = FastAPI(openapi_url=None,
+                      docs_url=None,
+                      redoc_url=None,
+                      lifespan=lifespan)
+    else:
+        app = FastAPI(lifespan=lifespan)
     app.include_router(router)
     app.root_path = args.root_path
 