From 806949514ab07a2d7218645022c22962696adf46 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 2 Aug 2024 10:03:24 -0700
Subject: [PATCH] [ci] set timeout for test_oot_registration.py (#7082)

---
 tests/entrypoints/openai/test_oot_registration.py | 4 ++++
 vllm/worker/worker.py                             | 4 +++-
 vllm/worker/xpu_worker.py                         | 4 +++-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py
index dbbda6de1fa0..5272ac4065f1 100644
--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
@@ -36,10 +36,12 @@ def test_oot_registration_for_api_server():
     ctx = torch.multiprocessing.get_context()
     server = ctx.Process(target=server_function, args=(port, ))
     server.start()
+    MAX_SERVER_START_WAIT_S = 60
     client = OpenAI(
         base_url=f"http://localhost:{port}/v1",
         api_key="token-abc123",
     )
+    now = time.time()
     while True:
         try:
             completion = client.chat.completions.create(
@@ -57,6 +59,8 @@ def test_oot_registration_for_api_server():
         except OpenAIError as e:
             if "Connection error" in str(e):
                 time.sleep(3)
+                if time.time() - now > MAX_SERVER_START_WAIT_S:
+                    raise RuntimeError("Server did not start in time") from e
             else:
                 raise e
     server.kill()
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index f3c379d1aa34..9e2cfff435cf 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -186,7 +186,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # GPU did not change their memory usage during the profiling.
         peak_memory = self.init_gpu_memory - free_gpu_memory
         assert peak_memory > 0, (
-            "Error in memory profiling. This happens when the GPU memory was "
+            "Error in memory profiling. "
+            f"Initial free memory {self.init_gpu_memory}, current free memory"
+            f" {free_gpu_memory}. This happens when the GPU memory was "
             "not properly cleaned up before initializing the vLLM instance.")
 
         cache_block_size = self.get_cache_block_size_bytes()
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 6a822c2ba3e7..0f22d67c4f25 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -138,7 +138,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # GPU did not change their memory usage during the profiling.
         peak_memory = self.init_gpu_memory - free_gpu_memory
         assert peak_memory > 0, (
-            "Error in memory profiling. This happens when the GPU memory was "
+            "Error in memory profiling. "
+            f"Initial free memory {self.init_gpu_memory}, current free memory"
+            f" {free_gpu_memory}. This happens when the GPU memory was "
             "not properly cleaned up before initializing the vLLM instance.")
 
         cache_block_size = self.get_cache_block_size_bytes()
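
Note: below is a minimal, self-contained sketch of the deadline-bounded retry pattern the test change introduces: poll, sleep between attempts, and give up once a fixed startup budget is exhausted. It probes a raw TCP port rather than issuing the OpenAI chat-completion request the real test uses; wait_for_port, its host/port arguments, and the 1-second connect timeout are illustrative assumptions, not part of this patch.

import socket
import time

MAX_SERVER_START_WAIT_S = 60  # same startup budget as the patch
POLL_INTERVAL_S = 3           # same sleep between attempts as the patch


def wait_for_port(host: str, port: int) -> None:
    """Block until (host, port) accepts a TCP connection, or raise
    RuntimeError once the startup budget is exhausted."""
    # Record the deadline once, before the first attempt.
    deadline = time.time() + MAX_SERVER_START_WAIT_S
    while True:
        try:
            with socket.create_connection((host, port), timeout=1):
                return  # server is up and accepting connections
        except OSError as e:
            time.sleep(POLL_INTERVAL_S)
            # Check the deadline after sleeping, as the patch does, so a
            # server that never comes up fails the test instead of
            # hanging CI forever.
            if time.time() > deadline:
                raise RuntimeError("Server did not start in time") from e

Checking the deadline only inside the connection-error branch matches the patch: any other OpenAIError is re-raised immediately, so the timeout only guards the "server not yet listening" case.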
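
The two worker changes only enrich an assert message, but the quantity they report is worth spelling out. The following is a sketch of that bookkeeping, assuming a CUDA device and using the standard torch.cuda.mem_get_info() call, which returns (free, total) bytes; profile_peak_memory and the elided profiling step are illustrative, not vLLM's actual worker code.

import torch


def profile_peak_memory() -> int:
    """Sketch: measure GPU memory consumed by a profiling run as the
    drop in free memory from before to after."""
    init_gpu_memory, _ = torch.cuda.mem_get_info()

    # ... run the profiling forward pass here (elided) ...

    free_gpu_memory, _ = torch.cuda.mem_get_info()
    peak_memory = init_gpu_memory - free_gpu_memory
    # The difference can come out non-positive when the GPU was not
    # properly cleaned up before initialization, e.g. another process
    # released memory between the two measurements; reporting both
    # numbers in the message makes that failure mode debuggable.
    assert peak_memory > 0, (
        "Error in memory profiling. "
        f"Initial free memory {init_gpu_memory}, current free memory"
        f" {free_gpu_memory}.")
    return peak_memory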