From bac2c901b48fbce06cb1b41fdf63d38791bb4777 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 12:12:27 +0200
Subject: [PATCH 01/18] add readiness and liveness k8s probes for openai api_server

---
 vllm/entrypoints/openai/api_server.py | 31 ++++++++++++++++++++++++++-
 vllm/entrypoints/openai/protocol.py   | 28 ++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 0fe4dd245b5e..13afbc58a295 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -30,7 +30,9 @@
                                               DetokenizeResponse,
                                               EmbeddingRequest, ErrorResponse,
                                               TokenizeRequest,
-                                              TokenizeResponse)
+                                              TokenizeResponse,
+                                              ResponseLiveness,
+                                              ResponseReadyness)
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
@@ -89,6 +91,33 @@ async def health() -> Response:
     await openai_serving_chat.engine.check_health()
     return Response(status_code=200)
 
+@router.get(
+    "/liveness",
+    response_model=ResponseLiveness,
+    name="liveness",
+    tags=["technical"],
+)
+async def get_liveness() -> ResponseLiveness:
+    """Liveness probe for k8s"""
+    liveness_msg = ResponseLiveness(alive="ok")
+    return liveness_msg
+
+
+@router.get(
+    "/readiness",
+    response_model=ResponseReadiness,
+    name="readiness",
+    tags=["technical"],
+)
+async def get_readiness() -> ResponseReadiness:
+    """Readiness probe for k8s"""
+    model_weights = await openai_serving_chat.engine.engine.model_executor.driver_worker.model_runner.model_memory_usage
+
+    if model_weights:
+        return ResponseReadiness(ready="ok")
+    else:
+        return ResponseReadiness(ready="ko")
+
 
 @router.post("/tokenize")
 async def tokenize(request: TokenizeRequest):
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 3b35ae1ebd70..a974b8d4cba0 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -720,3 +720,31 @@ class DetokenizeRequest(OpenAIBaseModel):
 
 class DetokenizeResponse(OpenAIBaseModel):
     prompt: str
+
+class ResponseLiveness(OpenAIBaseModel):
+    """Return object for liveness probe"""
+
+    alive: str = Field(None, title="Alive message")
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                "liveness": {
+                    "alive": "ok"
+                }
+            ]
+        }
+    }
+
+class ResponseReadiness(OpenAIBaseModel):
+    """Return object for readiness probe"""
+
+    ready: str = Field(None, title="Ready message")
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                "readiness": {
+                    "ready": "ok"
+                }
+            ]
+        }
+    }
\ No newline at end of file

From d65bf583df23c69488fea6ef691c9d4a65df43fd Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 14:06:34 +0200
Subject: [PATCH 02/18] update naming for pydantic classes from openai protocol

---
 vllm/entrypoints/openai/api_server.py | 4 ++--
 vllm/entrypoints/openai/protocol.py   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 13afbc58a295..875049e362ff 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -31,8 +31,8 @@
                                               EmbeddingRequest, ErrorResponse,
                                               TokenizeRequest,
                                               TokenizeResponse,
-                                              ResponseLiveness,
-                                              ResponseReadyness)
+                                              LivenessResponse,
+                                              ReadinessResponse)
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index a974b8d4cba0..f375ce23215d 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -721,7 +721,7 @@ class DetokenizeRequest(OpenAIBaseModel):
 class DetokenizeResponse(OpenAIBaseModel):
     prompt: str
 
-class ResponseLiveness(OpenAIBaseModel):
+class LivenessResponse(OpenAIBaseModel):
     """Return object for liveness probe"""
 
     alive: str = Field(None, title="Alive message")
@@ -735,7 +735,7 @@ class ResponseLiveness(OpenAIBaseModel):
         }
     }
 
-class ResponseReadiness(OpenAIBaseModel):
+class ReadinessResponse(OpenAIBaseModel):
     """Return object for readiness probe"""
 
     ready: str = Field(None, title="Ready message")

From 0c7945dd95a8a2dcf8240c97d4a4a7515dd30ce6 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 14:09:22 +0200
Subject: [PATCH 03/18] update naming for pydantic classes from openai protocol and remove await in variable definition

---
 vllm/entrypoints/openai/api_server.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 875049e362ff..6abe7d038ea7 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -93,30 +93,30 @@ async def health() -> Response:
 
 @router.get(
     "/liveness",
-    response_model=ResponseLiveness,
+    response_model=LivenessResponse,
     name="liveness",
     tags=["technical"],
 )
-async def get_liveness() -> ResponseLiveness:
+async def get_liveness() -> LivenessResponse:
     """Liveness probe for k8s"""
-    liveness_msg = ResponseLiveness(alive="ok")
+    liveness_msg = LivenessResponse(alive="ok")
     return liveness_msg
 
 
 @router.get(
     "/readiness",
-    response_model=ResponseReadiness,
+    response_model=ReadinessResponse,
     name="readiness",
     tags=["technical"],
 )
-async def get_readiness() -> ResponseReadiness:
+async def get_readiness() -> ReadinessResponse:
     """Readiness probe for k8s"""
-    model_weights = await openai_serving_chat.engine.engine.model_executor.driver_worker.model_runner.model_memory_usage
+    model_weights = openai_serving_chat.engine.engine.model_executor.driver_worker.model_runner.model_memory_usage
 
-    if model_weights:
-        return ResponseReadiness(ready="ok")
+    if model_weights > 0:
+        return ReadinessResponse(ready="ok")
     else:
-        return ResponseReadiness(ready="ko")
+        return ReadinessResponse(ready="ko")

From fa1c549f3b4faec9cfc2105f6c85f31d0bee41de Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 16:45:50 +0200
Subject: [PATCH 04/18] add tests for readiness and liveness endpoints

---
 tests/entrypoints/openai/test_basic.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index 2c721d9ba760..8568f37fdba1 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -59,3 +59,27 @@ async def test_log_metrics(client: openai.AsyncOpenAI):
     response = requests.get(base_url + "/metrics")
 
     assert response.status_code == HTTPStatus.OK
+
+@pytest.mark.asyncio
+def test_get_liveness(client: openai.AsyncOpenAI):
+    """Test the technical route /liveness"""
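+    # NOTE (illustrative comment, not part of the original patch):
+    # client.base_url ends with ".../v1/", so dropping the last three
+    # characters ("v1/") and stripping the trailing slash yields the server
+    # root, where these probe routes live outside the /v1 OpenAI prefix.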
+    base_url = str(client.base_url)[:-3].strip("/")
+    response = requests.get(base_url + "/liveness")
+    assert response.status_code == HTTPStatus.OK
+    assert response.json() == {"alive": "ok"}
+
+@pytest.mark.asyncio
+def test_get_readiness_ko(client: openai.AsyncOpenAI):
+    """Test the technical route /readiness when the model is not loaded"""
+    base_url = str(client.base_url)[:-3].strip("/")
+    response = requests.get(base_url + "/readiness")
+    assert response.status_code == HTTPStatus.OK
+    assert response.json() == {"ready": "ko"}
+
+@pytest.mark.asyncio
+def test_get_readiness_ok(client: openai.AsyncOpenAI):
+    """Test the technical route /readiness when the model is fully loaded"""
+    base_url = str(client.base_url)[:-3].strip("/")
+    response = requests.get(base_url + "/readiness")
+    assert response.status_code == HTTPStatus.OK
+    assert response.json() == {"ready": "ok"}

From 2fbaa2fe38faf64413f06dedc030675b62f47f33 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 16:50:39 +0200
Subject: [PATCH 05/18] correct syntax pydantic class in protocol

---
 vllm/entrypoints/openai/protocol.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index f375ce23215d..99a8dd7292b4 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -727,8 +727,7 @@ class LivenessResponse(OpenAIBaseModel):
     alive: str = Field(None, title="Alive message")
     model_config = {
         "json_schema_extra": {
-            "examples": [
-                "liveness": {
+            "examples": [{
                 "alive": "ok"
             }
             ]
         }
     }
@@ -741,8 +740,7 @@ class ReadinessResponse(OpenAIBaseModel):
     ready: str = Field(None, title="Ready message")
     model_config = {
         "json_schema_extra": {
-            "examples": [
-                "readiness": {
+            "examples": [{
                 "ready": "ok"
             }
             ]
         }
     }

From 27ef5ac4ce28ff27c22deea89028c12662afdc61 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 16:56:16 +0200
Subject: [PATCH 06/18] correct ruff errors

---
 vllm/entrypoints/openai/api_server.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 6abe7d038ea7..21ebf2339926 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -111,7 +111,8 @@ async def get_readiness() -> ReadinessResponse:
     """Readiness probe for k8s"""
-    model_weights = openai_serving_chat.engine.engine.model_executor.driver_worker.model_runner.model_memory_usage
+    driver_worker = openai_serving_chat.engine.engine.model_executor.driver_worker
+    model_weights = driver_worker.model_runner.model_memory_usage
 
     if model_weights > 0:
         return ReadinessResponse(ready="ok")

From 7fa6a37786ccfe593d55a9ba6fd5dc4c73c4c408 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 16:59:31 +0200
Subject: [PATCH 07/18] correct ruff errors

---
 vllm/entrypoints/openai/api_server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 21ebf2339926..ba3ceb362ae2 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -111,8 +111,8 @@ async def get_readiness() -> ReadinessResponse:
     """Readiness probe for k8s"""
-    driver_worker = openai_serving_chat.engine.engine.model_executor.driver_worker
-    model_weights = driver_worker.model_runner.model_memory_usage
+    d_worker = openai_serving_chat.engine.engine.model_executor.driver_worker
+    model_weights = d_worker.model_runner.model_memory_usage

From 18a9f2ccc094cb61320f5aea1b317176812b475f Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 17:03:25 +0200
Subject: [PATCH 08/18] fixing isort issues

---
 vllm/entrypoints/openai/api_server.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index ba3ceb362ae2..874e7ad7a692 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -29,10 +29,10 @@
                                               DetokenizeRequest,
                                               DetokenizeResponse,
                                               EmbeddingRequest, ErrorResponse,
-                                              TokenizeRequest,
-                                              TokenizeResponse,
                                               LivenessResponse,
-                                              ReadinessResponse)
+                                              ReadinessResponse,
+                                              TokenizeRequest,
+                                              TokenizeResponse)
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion

From 32e030b956b4db61dd6ed8446d56ed38e2bd24b7 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 17:23:49 +0200
Subject: [PATCH 09/18] update some typo

---
 tests/entrypoints/openai/test_basic.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index 8568f37fdba1..f99dabcd7be3 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -64,7 +64,9 @@ def test_get_liveness(client: openai.AsyncOpenAI):
     """Test the technical route /liveness"""
     base_url = str(client.base_url)[:-3].strip("/")
+
     response = requests.get(base_url + "/liveness")
+
     assert response.status_code == HTTPStatus.OK
     assert response.json() == {"alive": "ok"}
@@ -72,7 +74,9 @@ def test_get_readiness_ko(client: openai.AsyncOpenAI):
     """Test the technical route /readiness when the model is not loaded"""
     base_url = str(client.base_url)[:-3].strip("/")
+
     response = requests.get(base_url + "/readiness")
+
     assert response.status_code == HTTPStatus.OK
     assert response.json() == {"ready": "ko"}
@@ -80,6 +84,8 @@ def test_get_readiness_ok(client: openai.AsyncOpenAI):
     """Test the technical route /readiness when the model is fully loaded"""
     base_url = str(client.base_url)[:-3].strip("/")
+
     response = requests.get(base_url + "/readiness")
+
     assert response.status_code == HTTPStatus.OK
     assert response.json() == {"ready": "ok"}

From 5127e9162e3c9b00676103cb9d212a4357ac6fb7 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Fri, 2 Aug 2024 17:30:39 +0200
Subject: [PATCH 10/18] correct some yapf errors

---
 tests/entrypoints/openai/test_basic.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index f99dabcd7be3..380221c88b01 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -60,6 +60,7 @@ async def test_log_metrics(client: openai.AsyncOpenAI):
     assert response.status_code == HTTPStatus.OK
 
+
 @pytest.mark.asyncio
 def test_get_liveness(client: openai.AsyncOpenAI):
     """Test the technical route /liveness"""
     base_url = str(client.base_url)[:-3].strip("/")
 
     response = requests.get(base_url + "/liveness")
 
     assert response.status_code == HTTPStatus.OK
     assert response.json() == {"alive": "ok"}
 
+
 @pytest.mark.asyncio
 def test_get_readiness_ko(client: openai.AsyncOpenAI):
     """Test the technical route /readiness when the model is not loaded"""
     base_url = str(client.base_url)[:-3].strip("/")
 
     response = requests.get(base_url + "/readiness")
 
     assert response.status_code == HTTPStatus.OK
     assert response.json() == {"ready": "ko"}
 
+
 @pytest.mark.asyncio
 def test_get_readiness_ok(client: openai.AsyncOpenAI):
     """Test the technical route /readiness when the model is fully loaded"""
     base_url = str(client.base_url)[:-3].strip("/")
 
     response = requests.get(base_url + "/readiness")
-    
+
     assert response.status_code == HTTPStatus.OK
     assert response.json() == {"ready": "ok"}

From c698d7654d94a4e9fe2b856a84fa25e71eae784f Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Mon, 5 Aug 2024 13:34:48 +0200
Subject: [PATCH 11/18] correct readiness probe regarding its http status

---
 tests/entrypoints/openai/test_basic.py | 11 -----------
 vllm/entrypoints/openai/api_server.py  |  2 --
 2 files changed, 13 deletions(-)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index 380221c88b01..1a8f6356cc57 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -72,17 +72,6 @@ def test_get_liveness(client: openai.AsyncOpenAI):
     assert response.json() == {"alive": "ok"}
 
 
-@pytest.mark.asyncio
-def test_get_readiness_ko(client: openai.AsyncOpenAI):
-    """Test the technical route /readiness when the model is not loaded"""
-    base_url = str(client.base_url)[:-3].strip("/")
-
-    response = requests.get(base_url + "/readiness")
-
-    assert response.status_code == HTTPStatus.OK
-    assert response.json() == {"ready": "ko"}
-
-
 @pytest.mark.asyncio
 def test_get_readiness_ok(client: openai.AsyncOpenAI):
     """Test the technical route /readiness when the model is fully loaded"""
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 874e7ad7a692..5f741b39ac72 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -116,8 +116,6 @@ async def get_readiness() -> ReadinessResponse:
 
     if model_weights > 0:
         return ReadinessResponse(ready="ok")
-    else:
-        return ReadinessResponse(ready="ko")

From ea8be8096105f68ed1513c776f0d4d7dd2730977 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Mon, 5 Aug 2024 13:45:59 +0200
Subject: [PATCH 12/18] replace liveness endpoint by health endpoint and renaming readiness endpoint

---
 tests/entrypoints/openai/test_basic.py | 15 ++-------------
 vllm/entrypoints/openai/api_server.py  | 13 +------------
 vllm/entrypoints/openai/protocol.py    | 14 +-------------
 3 files changed, 4 insertions(+), 38 deletions(-)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index 1a8f6356cc57..11b46322deca 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -62,22 +62,11 @@ async def test_log_metrics(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-def test_get_liveness(client: openai.AsyncOpenAI):
-    """Test the technical route /liveness"""
-    base_url = str(client.base_url)[:-3].strip("/")
-
-    response = requests.get(base_url + "/liveness")
-
-    assert response.status_code == HTTPStatus.OK
-    assert response.json() == {"alive": "ok"}
-
-
-@pytest.mark.asyncio
-def test_get_readiness_ok(client: openai.AsyncOpenAI):
+async def test_get_readiness_ok(client: openai.AsyncOpenAI):
     """Test the technical route /readiness when the model is fully loaded"""
     base_url = str(client.base_url)[:-3].strip("/")
 
-    response = requests.get(base_url + "/readiness")
+    response = requests.get(base_url + "/ready")
 
     assert response.status_code == HTTPStatus.OK
     assert response.json() == {"ready": "ok"}
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 5f741b39ac72..abfc17cc5194 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -91,20 +91,9 @@ async def health() -> Response:
     await openai_serving_chat.engine.check_health()
     return Response(status_code=200)
 
-@router.get(
-    "/liveness",
-    response_model=LivenessResponse,
-    name="liveness",
-    tags=["technical"],
-)
-async def get_liveness() -> LivenessResponse:
-    """Liveness probe for k8s"""
-    liveness_msg = LivenessResponse(alive="ok")
-    return liveness_msg
-
 
 @router.get(
-    "/readiness",
+    "/ready",
     response_model=ReadinessResponse,
     name="readiness",
     tags=["technical"],
 )
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 99a8dd7292b4..dbd92f42ed8a 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -720,19 +720,7 @@ class DetokenizeRequest(OpenAIBaseModel):
 
 class DetokenizeResponse(OpenAIBaseModel):
     prompt: str
-
-class LivenessResponse(OpenAIBaseModel):
-    """Return object for liveness probe"""
-
-    alive: str = Field(None, title="Alive message")
-    model_config = {
-        "json_schema_extra": {
-            "examples": [{
-                "alive": "ok"
-            }
-            ]
-        }
-    }
+

From c0baaead8920393f9f1a1127ccc22a07f6623976 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Mon, 5 Aug 2024 13:57:11 +0200
Subject: [PATCH 13/18] clean some imports and configure error response for readiness endpoint

---
 tests/entrypoints/openai/test_basic.py |  1 -
 vllm/entrypoints/openai/api_server.py  |  6 ++----
 vllm/entrypoints/openai/protocol.py    | 16 +---------------
 3 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index 11b46322deca..e775714b7167 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -69,4 +69,3 @@ async def test_get_readiness_ok(client: openai.AsyncOpenAI):
     response = requests.get(base_url + "/ready")
 
     assert response.status_code == HTTPStatus.OK
-    assert response.json() == {"ready": "ok"}
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index abfc17cc5194..2a76f69570ac 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -29,8 +29,6 @@
                                               DetokenizeRequest,
                                               DetokenizeResponse,
                                               EmbeddingRequest, ErrorResponse,
-                                              LivenessResponse,
-                                              ReadinessResponse,
                                               TokenizeRequest,
                                               TokenizeResponse)
 # yapf: enable
@@ -98,13 +96,13 @@ async def health() -> Response:
     name="readiness",
     tags=["technical"],
 )
-async def get_readiness() -> ReadinessResponse:
+async def get_readiness() -> Response:
     """Readiness probe for k8s"""
     d_worker = openai_serving_chat.engine.engine.model_executor.driver_worker
     model_weights = d_worker.model_runner.model_memory_usage
 
     if model_weights > 0:
-        return ReadinessResponse(ready="ok")
+        return Response(status_code=200)
 
 
 @router.post("/tokenize")
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index dbd92f42ed8a..c7ede0c334b2 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -719,18 +719,4 @@ class DetokenizeRequest(OpenAIBaseModel):
 
 
 class DetokenizeResponse(OpenAIBaseModel):
-    prompt: str
-
-
-class ReadinessResponse(OpenAIBaseModel):
-    """Return object for readiness probe"""
-
-    ready: str = Field(None, title="Ready message")
-    model_config = {
-        "json_schema_extra": {
-            "examples": [{
-                "ready": "ok"
-            }
-            ]
-        }
-    }
\ No newline at end of file
+    prompt: str
\ No newline at end of file

From 3a8b22740ae14d9b32a85f69238ade06114de2c5 Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Mon, 5 Aug 2024 14:21:30 +0200
Subject: [PATCH 14/18] correct model response in readiness endpoint

---
 vllm/entrypoints/openai/api_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 2a76f69570ac..71dca2fb43f0 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -92,7 +92,7 @@ async def health() -> Response:
 
 @router.get(
     "/ready",
-    response_model=ReadinessResponse,
+    response_model=Response,
     name="readiness",
     tags=["technical"],
 )

From ac095c115de8e618a9774c5847dd5ae03b6265fb Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Mon, 5 Aug 2024 15:15:06 +0200
Subject: [PATCH 15/18] add return response 500 for readiness if model weights not loaded

---
 tests/entrypoints/openai/test_basic.py | 9 +++++++++
 vllm/entrypoints/openai/api_server.py  | 3 ++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index e775714b7167..a23c57ba9762 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -69,3 +69,12 @@ async def test_get_readiness_ok(client: openai.AsyncOpenAI):
     response = requests.get(base_url + "/ready")
 
     assert response.status_code == HTTPStatus.OK
+
+@pytest.mark.asyncio
+async def test_get_readiness_ok(client: openai.AsyncOpenAI):
+    """Test the technical route /readiness when the model is fully loaded"""
+    base_url = str(client.base_url)[:-3].strip("/")
+
+    response = requests.get(base_url + "/ready")
+
+    assert response.status_code == HTTPStatus.OK
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 71dca2fb43f0..ab900924f42b 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -92,7 +92,6 @@ async def health() -> Response:
 
 @router.get(
     "/ready",
-    response_model=Response,
     name="readiness",
     tags=["technical"],
 )
@@ -103,6 +102,8 @@ async def get_readiness() -> Response:
 
     if model_weights > 0:
         return Response(status_code=200)
+    else:
+        return Response(status_code=500)

From 14b2b9100404bcf1d046a9dd02471ad489e7bd4a Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Mon, 5 Aug 2024 16:08:54 +0200
Subject: [PATCH 16/18] Update test_basic.py

Remove duplicate.
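
For context, the duplicate was more than redundant: when a module defines two
functions with the same name, the second `def` simply rebinds the name, so
pytest collects only the later definition and the earlier test silently
disappears. A minimal sketch (hypothetical names, not from this repo):

    def test_example():
        assert 1 + 1 == 2

    def test_example():  # rebinds the name; the first body never runs
        assert 2 + 2 == 4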
---
 tests/entrypoints/openai/test_basic.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index a23c57ba9762..e775714b7167 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -69,12 +69,3 @@ async def test_get_readiness_ok(client: openai.AsyncOpenAI):
     response = requests.get(base_url + "/ready")
 
     assert response.status_code == HTTPStatus.OK
-
-@pytest.mark.asyncio
-async def test_get_readiness_ok(client: openai.AsyncOpenAI):
-    """Test the technical route /readiness when the model is fully loaded"""
-    base_url = str(client.base_url)[:-3].strip("/")
-
-    response = requests.get(base_url + "/ready")
-
-    assert response.status_code == HTTPStatus.OK

From b06d68618877e12c29333639c616c3385ecf147a Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Tue, 6 Aug 2024 15:52:45 +0200
Subject: [PATCH 17/18] update the readiness endpoint with a try clause

---
 vllm/entrypoints/openai/api_server.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index ab900924f42b..cec702b704a7 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -9,7 +9,7 @@
 import fastapi
 import uvicorn
-from fastapi import APIRouter, Request
+from fastapi import APIRouter, Request, HTTPException
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -97,13 +97,13 @@ async def health() -> Response:
 )
 async def get_readiness() -> Response:
     """Readiness probe for k8s"""
-    d_worker = openai_serving_chat.engine.engine.model_executor.driver_worker
-    model_weights = d_worker.model_runner.model_memory_usage
+    try :
+        d_worker = openai_serving_chat.engine.engine.model_executor.driver_worker
+        model_weights = d_worker.model_runner.model_memory_usage
 
-    if model_weights > 0:
-        return Response(status_code=200)
-    else:
-        return Response(status_code=500)
+        if model_weights > 0:
+            return Response(status_code=200)
+    except: HTTPException(status_code=500, detail="Model not loaded yet")

From e950836bea5d9067f62f363a5a7885c028fd2c7e Mon Sep 17 00:00:00 2001
From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Date: Thu, 8 Aug 2024 11:23:46 +0200
Subject: [PATCH 18/18] add check if KV cache has been set up in readiness endpoint

---
 vllm/entrypoints/openai/api_server.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index cec702b704a7..e1294f7a7536 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -98,12 +98,19 @@ async def health() -> Response:
 async def get_readiness() -> Response:
     """Readiness probe for k8s"""
     try :
-        d_worker = openai_serving_chat.engine.engine.model_executor.driver_worker
-        model_weights = d_worker.model_runner.model_memory_usage
+        model_executor = openai_serving_chat.engine.engine.model_executor
+        model_runner = model_executor.driver_worker.model_runner
+
+        # check if model weight are loaded in gpu memory
+        model_weights = model_runner.model_memory_usage
 
-        if model_weights > 0:
+        # check if KV cache has been set up
+        num_cpu_blocks = model_runner.num_cpu_blocks
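+        # (illustrative comment, not part of the original patch) the
+        # num_cpu_blocks / num_gpu_blocks attributes are assumed to be set on
+        # the model runner once the KV cache has been profiled and allocated;
+        # until then they are zero or absent, and a missing attribute raises
+        # AttributeError, which the except clause below treats as not ready.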
+        num_gpu_blocks = model_runner.num_gpu_blocks
+
+        if model_weights > 0 and num_cpu_blocks > 0 and num_gpu_blocks > 0 :
             return Response(status_code=200)
-    except: HTTPException(status_code=500, detail="Model not loaded yet")
+    except: HTTPException(status_code=500, detail="Model not loaded yet or KV cache not setup yet")
 
 
 @router.post("/tokenize")
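
A closing sketch (editorial, not part of the series): the endpoints the series
settles on are /health for liveness and /ready for readiness, so a Kubernetes
deployment would point an httpGet livenessProbe at /health and an httpGet
readinessProbe at /ready. Note that, as written, the except branch builds an
HTTPException without raising it, so the failure path may not actually surface
as a 500; the polling sketch below therefore treats anything other than a 200
as "not ready". Host, port, and timing values are assumptions for illustration.

    # probe_check.py -- minimal sketch for exercising the probe endpoints.
    # Assumes a vLLM OpenAI-compatible server listening on localhost:8000.
    import time

    import requests

    BASE_URL = "http://localhost:8000"


    def wait_until_ready(timeout_s: float = 300.0,
                         interval_s: float = 2.0) -> bool:
        """Poll /ready until weights and KV cache are reported loaded."""
        deadline = time.monotonic() + timeout_s
        while time.monotonic() < deadline:
            try:
                # /ready is intended to answer 200 only once model weights
                # are in GPU memory and KV cache blocks are allocated.
                if requests.get(f"{BASE_URL}/ready", timeout=5).status_code == 200:
                    return True
            except requests.ConnectionError:
                pass  # server process may not be listening yet
            time.sleep(interval_s)
        return False


    if __name__ == "__main__":
        # /health is the pre-existing liveness check kept after the
        # separate /liveness route was dropped in patch 12.
        alive = requests.get(f"{BASE_URL}/health", timeout=5).status_code == 200
        print("live:", alive)
        print("ready:", wait_until_ready())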