[Core][Bugfix][Perf] Introduce MQLLMEngine to avoid asyncio OH (vllm-project#8157)

Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
5 people authored Sep 18, 2024
1 parent 9d104b5 commit 7c7714d
Showing 36 changed files with 1,467 additions and 1,172 deletions.
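
For readers skimming the diff: the change moves the OpenAI API frontend off AsyncLLMEngine and onto an MQLLMEngineClient that talks to an engine loop running in a separate process over a message queue (ZeroMQ in vLLM), keeping asyncio scheduling overhead out of the generation loop. The snippet below is only a minimal conceptual sketch of that process-plus-message-queue pattern, assuming pyzmq; the names, wire format, and socket layout are invented for illustration and do not reflect vLLM's actual implementation.

```python
# Conceptual sketch only: a blocking "engine" loop in its own process,
# driven over a ZeroMQ REQ/REP socket. Names and message format are made up
# and do not mirror vLLM's MQLLMEngine.
import multiprocessing as mp

import zmq

ADDR = "tcp://127.0.0.1:5555"  # hypothetical endpoint


def engine_proc(addr: str) -> None:
    """Run a plain synchronous loop -- no asyncio in the hot path."""
    ctx = zmq.Context()
    sock = ctx.socket(zmq.REP)
    sock.bind(addr)
    while True:
        request = sock.recv_pyobj()          # e.g. {"prompt": "..."}
        if request is None:                  # shutdown sentinel
            sock.send_pyobj(None)
            break
        # A real engine would schedule requests and run model forward passes here.
        sock.send_pyobj({"text": request["prompt"].upper()})
    sock.close()
    ctx.term()


if __name__ == "__main__":
    proc = mp.Process(target=engine_proc, args=(ADDR,), daemon=True)
    proc.start()

    ctx = zmq.Context()
    client = ctx.socket(zmq.REQ)
    client.connect(ADDR)

    client.send_pyobj({"prompt": "hello"})
    print(client.recv_pyobj())               # -> {'text': 'HELLO'}

    client.send_pyobj(None)                  # ask the engine process to exit
    client.recv_pyobj()
    proc.join()
```

On the frontend side this corresponds to the new MQLLMEngineClient seen in the test diffs below, while the engine process keeps a plain synchronous loop.
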
4 changes: 3 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -43,13 +43,15 @@ steps:
   fast_check: true
   source_file_dependencies:
   - vllm/
+  - tests/mq_llm_engine
   - tests/async_engine
   - tests/test_inputs
   - tests/multimodal
   - tests/test_utils
   - tests/worker
   commands:
-  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
4 changes: 2 additions & 2 deletions docs/source/dev/profiling/profiling_index.rst
@@ -21,8 +21,8 @@ Traces can be visualized using https://ui.perfetto.dev/.
 .. tip::
 
     To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
-    Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes.
-    ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+    Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
+    ``export VLLM_RPC_TIMEOUT=1800000``
 
 Example commands and usage:
 ===========================
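
The same timeout can also be set from Python when the server is launched as a subprocess. A small sketch, assuming the usual `python -m vllm.entrypoints.openai.api_server` entry point; the model name and flags are placeholders to adapt:

```python
# Sketch: set VLLM_RPC_TIMEOUT (milliseconds) before starting the
# OpenAI-compatible server, mirroring the shell one-liner in the docs above.
import os
import subprocess

env = dict(os.environ, VLLM_RPC_TIMEOUT="1800000")  # 30 minutes
subprocess.run(
    ["python", "-m", "vllm.entrypoints.openai.api_server",
     "--model", "meta-llama/Meta-Llama-3-70B-Instruct"],  # example model, replace as needed
    env=env,
    check=True,
)
```
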
106 changes: 0 additions & 106 deletions tests/async_engine/test_openapi_server.py

This file was deleted.

120 changes: 0 additions & 120 deletions tests/entrypoints/openai/rpc/test_zmq_client.py

This file was deleted.

56 changes: 25 additions & 31 deletions tests/entrypoints/openai/test_accuracy.py
@@ -18,38 +18,32 @@
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
 EXPECTED_VALUE = 0.58
+DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
+MORE_ARGS_LIST = [["--enable-chunked-prefill"], ["--num-scheduler-steps", "8"]]
 
 
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--max-model-len", "4096", "--enable-chunked-prefill",
-        "--disable-log-requests", "--enforce-eager"
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest.fixture(scope="module")
-def server_data(server):
-    return {
-        "url": f"{server.url_for('v1')}/completions",
-    }
+@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
+def test_lm_eval_accuracy(more_args):
+    args = list(DEFAULT_ARGS)
+    args.extend(more_args)
 
+    print(f"Running with: {args}")
 
-def test_lm_eval_accuracy(server_data):
-    model_args = (f"model={MODEL_NAME},"
-                  f"base_url={server_data['url']},"
-                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
-
-    results = lm_eval.simple_evaluate(
-        model="local-completions",
-        model_args=model_args,
-        tasks=TASK,
-    )
-
-    measured_value = results["results"][TASK][FILTER]
-    assert (measured_value - RTOL < EXPECTED_VALUE
-            and measured_value + RTOL > EXPECTED_VALUE
-            ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        url = f"{remote_server.url_for('v1')}/completions"
+
+        model_args = (
+            f"model={MODEL_NAME},"
+            f"base_url={url},"
+            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+
+        results = lm_eval.simple_evaluate(
+            model="local-completions",
+            model_args=model_args,
+            tasks=TASK,
+        )
+
+        measured_value = results["results"][TASK][FILTER]
+        assert (measured_value - RTOL < EXPECTED_VALUE
+                and measured_value + RTOL > EXPECTED_VALUE
+                ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
@@ -5,7 +5,7 @@
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ..utils import VLLM_PATH
+from ...utils import VLLM_PATH
 
 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
40 changes: 0 additions & 40 deletions tests/entrypoints/openai/test_mp_api_server.py

This file was deleted.

5 changes: 3 additions & 2 deletions tests/entrypoints/openai/test_serving_chat.py
@@ -4,7 +4,7 @@
 from unittest.mock import MagicMock
 
 from vllm.config import MultiModalConfig
-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -52,8 +52,9 @@ def test_async_serving_chat_init():
 
 
 def test_serving_chat_should_set_correct_max_tokens():
-    mock_engine = MagicMock(spec=AsyncLLMEngine)
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
 
     serving_chat = OpenAIServingChat(mock_engine,
                                      MockModelConfig(),