From a49f5bbf2f80606c176970fbcc1f7cf7c1b553d7 Mon Sep 17 00:00:00 2001
From: Nathan Azrak
Date: Sat, 28 Dec 2024 12:36:41 +1100
Subject: [PATCH 01/11] sagemaker minimum requirements - /invocations, /ping,
 sm image uses port 8080

Signed-off-by: Nathan Azrak
---
 Dockerfile                            | 10 ++++-
 vllm/entrypoints/openai/api_server.py | 56 ++++++++++++++++++++++++++-
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 153bff9cf565f..c981260a00d79 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -235,7 +235,9 @@ RUN mv vllm test_docs/
 
 #################### OPENAI API SERVER ####################
 # openai api server alternative
-FROM vllm-base AS vllm-openai
+
+# define sagemaker first, so it is not default from `docker build`
+FROM vllm-base AS vllm-sagemaker
 
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
@@ -247,5 +249,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 ENV VLLM_USAGE_SOURCE production-docker-image
 
+# port 8080 required by sagemaker, https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--port", "8080"]
+
+# the default image is `vllm-openai` which is identical to sagemaker without forcing the port
+from vllm-sagemaker as vllm-openai
+
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 #################### OPENAI API SERVER ####################

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2e45b474237f9..befe2e6ead97e 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -16,7 +16,7 @@
 from typing import AsyncIterator, Optional, Set, Tuple
 
 import uvloop
-from fastapi import APIRouter, FastAPI, Request
+from fastapi import APIRouter, FastAPI, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -44,11 +44,15 @@
                                               CompletionResponse,
                                               DetokenizeRequest,
                                               DetokenizeResponse,
+                                              EmbeddingChatRequest,
+                                              EmbeddingCompletionRequest,
                                               EmbeddingRequest,
                                               EmbeddingResponse,
                                               EmbeddingResponseData,
                                               ErrorResponse,
                                               LoadLoraAdapterRequest,
+                                              PoolingChatRequest,
+                                              PoolingCompletionRequest,
                                               PoolingRequest, PoolingResponse,
                                               ScoreRequest, ScoreResponse,
                                               TokenizeRequest,
@@ -315,6 +319,12 @@ async def health(raw_request: Request) -> Response:
     return Response(status_code=200)
 
 
+@router.post("/ping")
+async def ping(raw_request: Request) -> Response:
+    """Ping check. Endpoint required for SageMaker"""
+    return await health(raw_request)
+
+
 @router.post("/tokenize")
 @with_cancellation
 async def tokenize(request: TokenizeRequest, raw_request: Request):
@@ -488,6 +498,49 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
     return await create_score(request, raw_request)
 
 
+TASK_HANDLERS = {
+    "generate": {
+        "messages": (ChatCompletionRequest, create_chat_completion),
+        "default": (CompletionRequest, create_completion),
+    },
+    # should `embed` be a pooling task by default?
+ "embed": { + "messages": (EmbeddingChatRequest, create_embedding), + "default": (EmbeddingCompletionRequest, create_embedding), + }, + "score": { + "messages": (PoolingChatRequest, create_score), + "default": (PoolingCompletionRequest, create_score), + }, +} + + +@router.post("/invocations") +async def invocations(raw_request: Request): + """ + For SageMaker, routes requests to other handlers based on model `task`. + """ + body = await raw_request.json() + task = raw_request.app.state.task + + if task not in TASK_HANDLERS: + raise HTTPException( + status_code=400, + detail=f"Unsupported task: '{task}' for '/invocations'. " + f"Expected one of {set(TASK_HANDLERS.keys())}" + ) + + handler_config = TASK_HANDLERS[task] + if "messages" in body: + request_model, handler = handler_config["messages"] + else: + request_model, handler = handler_config["default"] + + # this is required since we lose the FastAPI automatic casting + request = request_model.model_validate(body) + return await handler(request, raw_request) + + if envs.VLLM_TORCH_PROFILER_DIR: logger.warning( "Torch Profiler is enabled in the API server. This should ONLY be " @@ -694,6 +747,7 @@ def init_app_state( chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, ) + state.task = model_config.task def create_server_socket(addr: Tuple[str, int]) -> socket.socket: From ca96f97f777d61be9bb63401ae9577c7678a026e Mon Sep 17 00:00:00 2001 From: Nathan Azrak Date: Sat, 28 Dec 2024 12:59:47 +1100 Subject: [PATCH 02/11] Fix spacing for formatter Signed-off-by: Nathan Azrak --- vllm/entrypoints/openai/api_server.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index befe2e6ead97e..f1467d49e15fd 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -527,8 +527,7 @@ async def invocations(raw_request: Request): raise HTTPException( status_code=400, detail=f"Unsupported task: '{task}' for '/invocations'. 
" - f"Expected one of {set(TASK_HANDLERS.keys())}" - ) + f"Expected one of {set(TASK_HANDLERS.keys())}") handler_config = TASK_HANDLERS[task] if "messages" in body: From b6d12d3e01fac0cbf695c47d5fc98c20a6a1fce3 Mon Sep 17 00:00:00 2001 From: Nathan Azrak Date: Sat, 28 Dec 2024 15:42:26 +1100 Subject: [PATCH 03/11] Correct score request mapping, remove messages variant as it is not supported Signed-off-by: Nathan Azrak --- vllm/entrypoints/openai/api_server.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f1467d49e15fd..9af9cec452b03 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -509,8 +509,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): "default": (EmbeddingCompletionRequest, create_embedding), }, "score": { - "messages": (PoolingChatRequest, create_score), - "default": (PoolingCompletionRequest, create_score), + "default": (ScoreRequest, create_score), }, } From dc09c67887cbf3eb53615c636a91e73741242a5e Mon Sep 17 00:00:00 2001 From: Nathan Azrak Date: Sat, 28 Dec 2024 16:24:27 +1100 Subject: [PATCH 04/11] Remove unused imports Signed-off-by: Nathan Azrak --- vllm/entrypoints/openai/api_server.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9af9cec452b03..0f3b61643d1ff 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -51,8 +51,6 @@ EmbeddingResponseData, ErrorResponse, LoadLoraAdapterRequest, - PoolingChatRequest, - PoolingCompletionRequest, PoolingRequest, PoolingResponse, ScoreRequest, ScoreResponse, TokenizeRequest, From b263e40bc897fdff9b104b44bdbb650972056bd2 Mon Sep 17 00:00:00 2001 From: Nathan Azrak Date: Sat, 28 Dec 2024 19:51:59 +1100 Subject: [PATCH 05/11] Add and handlers Signed-off-by: Nathan Azrak --- vllm/entrypoints/openai/api_server.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0f3b61643d1ff..57ff71d31fb66 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -51,6 +51,8 @@ EmbeddingResponseData, ErrorResponse, LoadLoraAdapterRequest, + PoolingChatRequest, + PoolingCompletionRequest, PoolingRequest, PoolingResponse, ScoreRequest, ScoreResponse, TokenizeRequest, @@ -501,7 +503,6 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): "messages": (ChatCompletionRequest, create_chat_completion), "default": (CompletionRequest, create_completion), }, - # should `embed` be a pooling task by default? 
"embed": { "messages": (EmbeddingChatRequest, create_embedding), "default": (EmbeddingCompletionRequest, create_embedding), @@ -509,6 +510,14 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): "score": { "default": (ScoreRequest, create_score), }, + "reward": { + "messages": (PoolingChatRequest, create_pooling), + "default": (PoolingCompletionRequest, create_pooling), + }, + "classify": { + "messages": (PoolingChatRequest, create_pooling), + "default": (PoolingCompletionRequest, create_pooling), + }, } From 256a35b3c8d065d96a7dc85b0572d03c0eb82463 Mon Sep 17 00:00:00 2001 From: Nathan Azrak Date: Mon, 30 Dec 2024 10:38:04 +1100 Subject: [PATCH 06/11] Add custom entrypoint mapping env vars to cli args Signed-off-by: Nathan Azrak --- Dockerfile | 16 ++++++++-------- sagemaker-entrypoint.sh | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 sagemaker-entrypoint.sh diff --git a/Dockerfile b/Dockerfile index c981260a00d79..0bdd40db6f4b7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -234,10 +234,8 @@ RUN mv vllm test_docs/ #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### -# openai api server alternative - -# define sagemaker first, so it is not default from `docker build` -FROM vllm-base AS vllm-sagemaker +# base openai image with additional requirements, for any subsequent openai-style images +FROM vllm-base AS vllm-openai-base # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ @@ -249,11 +247,13 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV VLLM_USAGE_SOURCE production-docker-image -# port 8080 required by sagemaker, https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--port", "8080"] +# define sagemaker first, so it is not default from `docker build` +FROM vllm-openai-base AS vllm-sagemaker + +RUN chmod +x entrypoint.sh +ENTRYPOINT ["./sagemaker-entrypoint.sh"] -# the default image is `vllm-openai` which is identical to sagemaker without forcing the port -from vllm-sagemaker as vllm-openai +from vllm-openai-base as vllm-openai ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] #################### OPENAI API SERVER #################### diff --git a/sagemaker-entrypoint.sh b/sagemaker-entrypoint.sh new file mode 100644 index 0000000000000..4fdb606916a28 --- /dev/null +++ b/sagemaker-entrypoint.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Define the prefix for environment variables to look for +PREFIX="SM_VLLM_" +ARG_PREFIX="--" + +# Initialize an array for storing the arguments +# port 8080 required by sagemaker, https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response +ARGS=(--port 8080) + +# Loop through all environment variables +env | grep "^${PREFIX}" | while IFS='=' read -r key value; do + # Remove the prefix from the key, convert to lowercase, and replace underscores with dashes + arg_name=$(echo "${key#${PREFIX}}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') + + # Add the argument name and value to the ARGS array + ARGS+=("${ARG_PREFIX}${arg_name}") + if [ -n "$value" ]; then + ARGS+=("$value") + fi +done + +# Pass the collected arguments to the main entrypoint +exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}" \ No newline at end of file From 
From 2e8a87f11112c94c888141dae192e82be25a65c8 Mon Sep 17 00:00:00 2001
From: Nathan Azrak
Date: Mon, 30 Dec 2024 15:40:07 +1100
Subject: [PATCH 07/11] use process substitution instead of a pipeline to fix
 local variable issues

Signed-off-by: Nathan Azrak
---
 sagemaker-entrypoint.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sagemaker-entrypoint.sh b/sagemaker-entrypoint.sh
index 4fdb606916a28..75a99ffc1f155 100644
--- a/sagemaker-entrypoint.sh
+++ b/sagemaker-entrypoint.sh
@@ -9,16 +9,16 @@ ARG_PREFIX="--"
 ARGS=(--port 8080)
 
 # Loop through all environment variables
-env | grep "^${PREFIX}" | while IFS='=' read -r key value; do
+while IFS='=' read -r key value; do
     # Remove the prefix from the key, convert to lowercase, and replace underscores with dashes
-    arg_name=$(echo "${key#${PREFIX}}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+    arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
 
     # Add the argument name and value to the ARGS array
     ARGS+=("${ARG_PREFIX}${arg_name}")
     if [ -n "$value" ]; then
         ARGS+=("$value")
     fi
-done
+done < <(env | grep "^${PREFIX}")
 
 # Pass the collected arguments to the main entrypoint
 exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
\ No newline at end of file
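Why PATCH 07 switches to process substitution: each stage of a bash pipeline runs in a subshell, so the ARGS+= inside the original env | grep | while loop mutated a copy of the array that was discarded when the pipeline finished. Process substitution keeps the loop in the current shell. A standalone demonstration of the behavior (not part of the patch):

    #!/bin/bash
    ARGS=()
    printf 'a\nb\n' | while read -r x; do ARGS+=("$x"); done
    echo "${#ARGS[@]}"    # prints 0 -- the loop ran in a subshell

    while read -r x; do ARGS+=("$x"); done < <(printf 'a\nb\n')
    echo "${#ARGS[@]}"    # prints 2 -- the loop ran in the current shell
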
From e9cef7784c23b6e08040a3ebe4b8107a448bcee6 Mon Sep 17 00:00:00 2001
From: Nathan Azrak
Date: Tue, 31 Dec 2024 09:04:42 +1100
Subject: [PATCH 08/11] Copy sagemaker entrypoint to image

Signed-off-by: Nathan Azrak
---
 Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 0bdd40db6f4b7..56c9d0e98494e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -250,7 +250,8 @@ ENV VLLM_USAGE_SOURCE production-docker-image
 # define sagemaker first, so it is not default from `docker build`
 FROM vllm-openai-base AS vllm-sagemaker
 
-RUN chmod +x entrypoint.sh
+COPY ./sagemaker-entrypoint.sh .
+RUN chmod +x sagemaker-entrypoint.sh
 ENTRYPOINT ["./sagemaker-entrypoint.sh"]
 
 from vllm-openai-base as vllm-openai

From b0e62559407c93e169804841b1de5e0871855664 Mon Sep 17 00:00:00 2001
From: Nathan Azrak
Date: Thu, 2 Jan 2025 08:28:03 +1100
Subject: [PATCH 09/11] Make /ping respond to GET and POST

Signed-off-by: Nathan Azrak
---
 vllm/entrypoints/openai/api_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 57ff71d31fb66..e8fd3dd0857ac 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -319,7 +319,7 @@ async def health(raw_request: Request) -> Response:
     return Response(status_code=200)
 
 
-@router.post("/ping")
+@router.api_route("/ping", methods=["GET", "POST"])
 async def ping(raw_request: Request) -> Response:
     """Ping check. Endpoint required for SageMaker"""
     return await health(raw_request)

From 312c97eee2429220b24b185bd86d402eb493dc6e Mon Sep 17 00:00:00 2001
From: Nathan Azrak
Date: Fri, 3 Jan 2025 09:44:51 +1100
Subject: [PATCH 10/11] Move sagemaker entrypoint to examples and adjust
 dockerfile

Signed-off-by: Nathan Azrak
---
 Dockerfile                                                  | 2 +-
 sagemaker-entrypoint.sh => examples/sagemaker-entrypoint.sh | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename sagemaker-entrypoint.sh => examples/sagemaker-entrypoint.sh (100%)

diff --git a/Dockerfile b/Dockerfile
index 56c9d0e98494e..cb4b63f3afeda 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image
 # define sagemaker first, so it is not default from `docker build`
 FROM vllm-openai-base AS vllm-sagemaker
 
-COPY ./sagemaker-entrypoint.sh .
+COPY examples/sagemaker-entrypoint.sh .
 RUN chmod +x sagemaker-entrypoint.sh
 ENTRYPOINT ["./sagemaker-entrypoint.sh"]
diff --git a/sagemaker-entrypoint.sh b/examples/sagemaker-entrypoint.sh
similarity index 100%
rename from sagemaker-entrypoint.sh
rename to examples/sagemaker-entrypoint.sh

From 09840ed0d1e7431100bc064e09f951c8fc729ce1 Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Thu, 2 Jan 2025 15:59:05 -0800
Subject: [PATCH 11/11] Update Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index cb4b63f3afeda..088314eb38dbe 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -254,7 +254,7 @@ COPY examples/sagemaker-entrypoint.sh .
 RUN chmod +x sagemaker-entrypoint.sh
 ENTRYPOINT ["./sagemaker-entrypoint.sh"]
 
-from vllm-openai-base as vllm-openai
+FROM vllm-openai-base AS vllm-openai
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 #################### OPENAI API SERVER ####################
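With the series applied, the SageMaker variant is an explicit build target while the final vllm-openai stage remains the default. A usage sketch (the image tags here are placeholders):

    # SageMaker image: port 8080 plus the env-var-driven entrypoint
    docker build --target vllm-sagemaker -t vllm-sagemaker:dev .

    # standard OpenAI-compatible image (also what an untargeted build produces,
    # since vllm-openai is the last stage in the Dockerfile)
    docker build --target vllm-openai -t vllm-openai:dev .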