This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit aebf20b

varun-sundar-rabindranath (Varun Sundar Rabindranath) and Varun Sundar Rabindranath authored
Benchmarking : Misc updates (#95)
SUMMARY:
Fixes and quality-of-life changes
- Fix the vllm engine `temperature` to 0.0 so the text generation is deterministic
- Fix time-per-output-token metric computation
- Add num_warmup_prompts and log_model_io options to benchmark throughput

TEST PLAN:
Manual testing

---------
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
1 parent 3ae527f commit aebf20b

4 files changed, +64 −31 lines changed

neuralmagic/benchmarks/scripts/backend_request_func.py (+3 −1)

@@ -111,7 +111,9 @@ async def async_request_vllm(
             "n": 1,
             "best_of": request_func_input.best_of,
             "use_beam_search": request_func_input.use_beam_search,
-            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
+            # TODO (varun) : Make temperature configurable
+            #"temperature": 0.0 if request_func_input.use_beam_search else 1.0,
+            "temperature": 0.0,
             "top_p": 1.0,
             "max_tokens": request_func_input.output_len,
             "ignore_eos": True,

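For context, a minimal sketch of the request body this helper now builds; the fields outside the hunk ("prompt", "stream") and all concrete values are assumptions for illustration, not the exact payload.

# Illustrative only: approximate shape of the payload built in async_request_vllm.
payload = {
    "prompt": "Hello, my name is",   # assumed field and value
    "n": 1,
    "best_of": 1,
    "use_beam_search": False,
    "temperature": 0.0,              # pinned to 0.0 so decoding is greedy and deterministic
    "top_p": 1.0,
    "max_tokens": 128,               # request_func_input.output_len in the real code
    "ignore_eos": True,
    "stream": True,                  # assumed field
}
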
neuralmagic/benchmarks/scripts/benchmark_serving.py (+5 −4)

@@ -33,8 +33,7 @@
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_benchmark_io
-# TODO (move this to scripts)
+from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_serving_request_io
 from .datasets_registry import get_dataset, DatasetArgs

 from neuralmagic.benchmarks.scripts.backend_request_func import (
@@ -100,7 +99,9 @@ def calculate_metrics(
         total_output += output_len
         total_input += input_requests[i][1]
         latencies.append(outputs[i].latency)
-        tpots.append((outputs[i].latency - outputs[i].ttft) / output_len)
+        if output_len > 1:
+            tpots.append(
+                (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
         ttfts.append(outputs[i].ttft)
         completed += 1

@@ -167,7 +168,7 @@ async def benchmark(backend: str, api_url: str, model_id: str,

     # Dump model i/o
     if log_model_io:
-        print_benchmark_io(outputs)
+        print_serving_request_io(input_requests, outputs)

     metrics = calculate_metrics(
         input_requests=input_requests,

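The metric fix: time-per-output-token (TPOT) now divides by output_len - 1, because the first generated token is already accounted for by the time-to-first-token (TTFT). A minimal sketch of the corrected arithmetic, with made-up numbers:

# Corrected per-request TPOT computation (hypothetical values).
latency = 2.0      # total request latency in seconds (assumed)
ttft = 0.5         # time to first token in seconds (assumed)
output_len = 16    # number of generated tokens (assumed)

if output_len > 1:
    # TTFT covers the first token, so the remaining (output_len - 1)
    # tokens share the rest of the latency.
    tpot = (latency - ttft) / (output_len - 1)   # 1.5 / 15 = 0.1 s/token
    # The old formula divided by output_len, which understated TPOT.
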
neuralmagic/benchmarks/scripts/benchmark_throughput.py (+33 −21)

@@ -13,7 +13,7 @@
 from pathlib import Path
 from typing import List, Optional, Tuple
 from transformers import AutoTokenizer
-from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus
+from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs
 from .datasets_registry import get_dataset, DatasetArgs


@@ -25,21 +25,21 @@ def get_tensor_parallel_size(args: argparse.Namespace) -> int:
     return tensor_parallel_size


-def run_vllm(
-    requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: str,
-    quantization: Optional[str],
-    tensor_parallel_size: int,
-    seed: int,
-    n: int,
-    use_beam_search: bool,
-    trust_remote_code: bool,
-    dtype: str,
-    max_model_len: Optional[int],
-    enforce_eager: bool,
-    sparsity: Optional[str],
-) -> float:
+def run_vllm(requests: List[Tuple[str, int, int]],
+             model: str,
+             tokenizer: str,
+             quantization: Optional[str],
+             tensor_parallel_size: int,
+             seed: int,
+             n: int,
+             use_beam_search: bool,
+             trust_remote_code: bool,
+             dtype: str,
+             max_model_len: Optional[int],
+             enforce_eager: bool,
+             sparsity: Optional[str],
+             num_warmup_prompts: int,
+             log_model_io: bool = False) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
         model=model,
@@ -53,13 +53,15 @@ def run_vllm(
         enforce_eager=enforce_eager,
     )

-    warmup_vllm_engine(engine=llm, model=model, num_prompts=1000)
+    warmup_vllm_engine(engine=llm, model=model, num_prompts=num_warmup_prompts)

     # Add the requests to the engine.
     for prompt, _, output_len in requests:
         sampling_params = SamplingParams(
             n=n,
-            temperature=0.0 if use_beam_search else 1.0,
+            # TODO (varun) Make temperature configurable
+            #temperature=0.0 if use_beam_search else 1.0,
+            temperature=0.0,
             top_p=1.0,
             use_beam_search=use_beam_search,
             ignore_eos=True,
@@ -74,9 +76,12 @@ def run_vllm(

     start = time.perf_counter()
     # FIXME(woosuk): Do not use internal method.
-    llm._run_engine(use_tqdm=True)
+    outputs = llm._run_engine(use_tqdm=True)
     end = time.perf_counter()

+    if log_model_io:
+        print_request_outputs(outputs)
+
     return end - start


@@ -96,7 +101,7 @@ def main(args: argparse.Namespace):
                                num_samples=args.num_prompts,
                                max_len=2048,
                                seed=42,
-                           ))
+                               fixed_output_len=args.output_len))
     else:
         # Make a synthetic dataset.
         requests = generate_synthetic_requests(args.input_len, args.output_len,
@@ -114,7 +119,9 @@ def main(args: argparse.Namespace):
                            args.dtype,
                            args.max_model_len,
                            args.enforce_eager,
-                           sparsity=args.sparsity)
+                           sparsity=args.sparsity,
+                           num_warmup_prompts=args.num_warmup_prompts,
+                           log_model_io=args.log_model_io)

     total_prompt_tokens = sum(prompt_len for _, prompt_len, _ in requests)
     total_output_tokens = sum(output_len for _, _, output_len in requests)
@@ -189,10 +196,15 @@ def main(args: argparse.Namespace):
                         type=int,
                         default=1000,
                         help="Number of prompts to process.")
+    parser.add_argument("--num-warmup-prompts",
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to do warmups with.")
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument("--log-model-io", action="store_true")
     parser.add_argument(
         '--max-model-len',
         type=int,

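On the command line the new options surface as --num-warmup-prompts and --log-model-io. A hypothetical sketch of how main() now forwards them into run_vllm; the model name, request tuple, and the other argument values are invented for illustration only.

# Illustrative only: calling run_vllm with the new parameters (values assumed).
requests = [("Hello, my name is", 5, 128)]        # (prompt, prompt_len, output_len), made up
elapsed = run_vllm(requests,
                   model="facebook/opt-125m",      # assumed model
                   tokenizer="facebook/opt-125m",
                   quantization=None,
                   tensor_parallel_size=1,
                   seed=0,
                   n=1,
                   use_beam_search=False,
                   trust_remote_code=False,
                   dtype="auto",
                   max_model_len=None,
                   enforce_eager=False,
                   sparsity=None,
                   num_warmup_prompts=10,          # new: number of warmup prompts (default 1000)
                   log_model_io=True)              # new: dump prompt/output text after the run
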
neuralmagic/benchmarks/scripts/common.py (+23 −5)

@@ -12,7 +12,7 @@
 from vllm.outputs import RequestOutput
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR
-from .backend_request_func import RequestFuncInput, async_request_vllm
+from .backend_request_func import RequestFuncInput, RequestFuncOutput, async_request_vllm
 from ...tools.call_cmd import call_cmd


@@ -204,9 +204,27 @@ def instantiate_benchmark_results_dict(benchmarking_script_name: str,
     return result_dict


-def print_benchmark_io(results: List[RequestOutput]) -> None:
+def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int,
+                  n_output_tokens: int) -> str:
+    return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n"
+
+
+def print_request_outputs(results: List[RequestOutput]) -> None:
     for result in results:
         output = result.outputs[0]
-        print(
-            f"\n\n inputs({len(result.prompt_token_ids)}): {result.prompt}\n output({len(output.token_ids)}): {output.text}"
-        )
+        io_log = format_io_log(result.prompt, output.text,
+                               len(result.prompt_token_ids),
+                               len(output.token_ids))
+        print(f"\n{io_log}")
+
+
+def print_serving_request_io(inputs: List[Tuple[str, int, int]],
+                             outputs: List[RequestFuncOutput]) -> None:
+    """
+    inputs: list of tuples where the tuple is [prompt, prompt_length, output_length],
+    outputs: list of RequestFuncOutput that is the output from the serving case (benchmark_serving.py)
+    Format and print the inputs and outputs.
+    """
+    for i, o in zip(inputs, outputs):
+        io_log = format_io_log(i[0], o.generated_text, i[1], i[2])
+        print(f"\n{io_log}")

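For reference, a rough sketch of what the new format_io_log helper produces; the prompt, output text, and token counts below are invented for illustration.

# Illustrative call to the new helper; all values are made up.
print(format_io_log(prompt="What is the capital of France?",
                    output_text="The capital of France is Paris.",
                    n_prompt_tokens=8,
                    n_output_tokens=9))
# Expected shape of the log, per the f-string in format_io_log:
#
# === Prompt (8) ==
# What is the capital of France?
# ==== output(9) ==
# The capital of France is Paris.
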
0 commit comments
