[Core][Bugfix][Perf] Introduce MQLLMEngine to avoid asyncio OH (vllm-project#8157)

Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
5 people authored Sep 18, 2024
1 parent 9d104b5 commit 7c7714d
Showing 36 changed files with 1,467 additions and 1,172 deletions.
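
For readers skimming the diff: the change moves the OpenAI API frontend off AsyncLLMEngine and onto an MQLLMEngineClient that talks to an engine loop running in a separate process over a message queue (ZeroMQ in vLLM), keeping asyncio scheduling overhead out of the generation loop. The snippet below is only a minimal conceptual sketch of that process-plus-message-queue pattern, assuming pyzmq; the names, wire format, and socket layout are invented for illustration and do not reflect vLLM's actual implementation.

```python
# Conceptual sketch only: a blocking "engine" loop in its own process,
# driven over a ZeroMQ REQ/REP socket. Names and message format are made up
# and do not mirror vLLM's MQLLMEngine.
import multiprocessing as mp

import zmq

ADDR = "tcp://127.0.0.1:5555"  # hypothetical endpoint


def engine_proc(addr: str) -> None:
    """Run a plain synchronous loop -- no asyncio in the hot path."""
    ctx = zmq.Context()
    sock = ctx.socket(zmq.REP)
    sock.bind(addr)
    while True:
        request = sock.recv_pyobj()          # e.g. {"prompt": "..."}
        if request is None:                  # shutdown sentinel
            sock.send_pyobj(None)
            break
        # A real engine would schedule requests and run model forward passes here.
        sock.send_pyobj({"text": request["prompt"].upper()})
    sock.close()
    ctx.term()


if __name__ == "__main__":
    proc = mp.Process(target=engine_proc, args=(ADDR,), daemon=True)
    proc.start()

    ctx = zmq.Context()
    client = ctx.socket(zmq.REQ)
    client.connect(ADDR)

    client.send_pyobj({"prompt": "hello"})
    print(client.recv_pyobj())               # -> {'text': 'HELLO'}

    client.send_pyobj(None)                  # ask the engine process to exit
    client.recv_pyobj()
    proc.join()
```

On the frontend side this corresponds to the new MQLLMEngineClient seen in the test diffs below, while the engine process keeps a plain synchronous loop.
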
4 changes: 3 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -43,13 +43,15 @@ steps:
   fast_check: true
   source_file_dependencies:
   - vllm/
+  - tests/mq_llm_engine
   - tests/async_engine
   - tests/test_inputs
   - tests/multimodal
   - tests/test_utils
   - tests/worker
   commands:
-  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
4 changes: 2 additions & 2 deletions docs/source/dev/profiling/profiling_index.rst
@@ -21,8 +21,8 @@ Traces can be visualized using https://ui.perfetto.dev/.
 .. tip::
 
     To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
-    Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes.
-    ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+    Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
+    ``export VLLM_RPC_TIMEOUT=1800000``
 
 Example commands and usage:
 ===========================
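
The same timeout can also be set from Python when the server is launched as a subprocess. A small sketch, assuming the usual `python -m vllm.entrypoints.openai.api_server` entry point; the model name and flags are placeholders to adapt:

```python
# Sketch: set VLLM_RPC_TIMEOUT (milliseconds) before starting the
# OpenAI-compatible server, mirroring the shell one-liner in the docs above.
import os
import subprocess

env = dict(os.environ, VLLM_RPC_TIMEOUT="1800000")  # 30 minutes
subprocess.run(
    ["python", "-m", "vllm.entrypoints.openai.api_server",
     "--model", "meta-llama/Meta-Llama-3-70B-Instruct"],  # example model, replace as needed
    env=env,
    check=True,
)
```
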
106 changes: 0 additions & 106 deletions tests/async_engine/test_openapi_server.py

This file was deleted.

120 changes: 0 additions & 120 deletions tests/entrypoints/openai/rpc/test_zmq_client.py

This file was deleted.

56 changes: 25 additions & 31 deletions tests/entrypoints/openai/test_accuracy.py
@@ -18,38 +18,32 @@
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
 EXPECTED_VALUE = 0.58
+DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
+MORE_ARGS_LIST = [["--enable-chunked-prefill"], ["--num-scheduler-steps", "8"]]
 
 
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--max-model-len", "4096", "--enable-chunked-prefill",
-        "--disable-log-requests", "--enforce-eager"
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest.fixture(scope="module")
-def server_data(server):
-    return {
-        "url": f"{server.url_for('v1')}/completions",
-    }
+@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
+def test_lm_eval_accuracy(more_args):
+    args = list(DEFAULT_ARGS)
+    args.extend(more_args)
 
+    print(f"Running with: {args}")
 
-def test_lm_eval_accuracy(server_data):
-    model_args = (f"model={MODEL_NAME},"
-                  f"base_url={server_data['url']},"
-                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
-
-    results = lm_eval.simple_evaluate(
-        model="local-completions",
-        model_args=model_args,
-        tasks=TASK,
-    )
-
-    measured_value = results["results"][TASK][FILTER]
-    assert (measured_value - RTOL < EXPECTED_VALUE
-            and measured_value + RTOL > EXPECTED_VALUE
-            ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        url = f"{remote_server.url_for('v1')}/completions"
+
+        model_args = (
+            f"model={MODEL_NAME},"
+            f"base_url={url},"
+            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+
+        results = lm_eval.simple_evaluate(
+            model="local-completions",
+            model_args=model_args,
+            tasks=TASK,
+        )
+
+        measured_value = results["results"][TASK][FILTER]
+        assert (measured_value - RTOL < EXPECTED_VALUE
+                and measured_value + RTOL > EXPECTED_VALUE
+                ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
@@ -5,7 +5,7 @@
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ..utils import VLLM_PATH
+from ...utils import VLLM_PATH
 
 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
40 changes: 0 additions & 40 deletions tests/entrypoints/openai/test_mp_api_server.py

This file was deleted.

5 changes: 3 additions & 2 deletions tests/entrypoints/openai/test_serving_chat.py
@@ -4,7 +4,7 @@
 from unittest.mock import MagicMock
 
 from vllm.config import MultiModalConfig
-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -52,8 +52,9 @@ def test_async_serving_chat_init():
 
 
 def test_serving_chat_should_set_correct_max_tokens():
-    mock_engine = MagicMock(spec=AsyncLLMEngine)
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
 
     serving_chat = OpenAIServingChat(mock_engine,
                                      MockModelConfig(),