13
13
from pathlib import Path
14
14
from typing import List , Optional , Tuple
15
15
from transformers import AutoTokenizer
16
- from .common import instantiate_benchmark_results_dict , generate_synthetic_requests , warmup_vllm_engine , num_available_gpus
16
+ from .common import instantiate_benchmark_results_dict , generate_synthetic_requests , warmup_vllm_engine , num_available_gpus , print_request_outputs
17
17
from .datasets_registry import get_dataset , DatasetArgs
18
18
19
19
@@ -25,21 +25,21 @@ def get_tensor_parallel_size(args: argparse.Namespace) -> int:
25
25
return tensor_parallel_size
26
26
27
27
28
- def run_vllm (
29
- requests : List [ Tuple [ str , int , int ]] ,
30
- model : str ,
31
- tokenizer : str ,
32
- quantization : Optional [ str ] ,
33
- tensor_parallel_size : int ,
34
- seed : int ,
35
- n : int ,
36
- use_beam_search : bool ,
37
- trust_remote_code : bool ,
38
- dtype : str ,
39
- max_model_len : Optional [ int ] ,
40
- enforce_eager : bool ,
41
- sparsity : Optional [ str ] ,
42
- ) -> float :
28
+ def run_vllm (requests : List [ Tuple [ str , int , int ]],
29
+ model : str ,
30
+ tokenizer : str ,
31
+ quantization : Optional [ str ] ,
32
+ tensor_parallel_size : int ,
33
+ seed : int ,
34
+ n : int ,
35
+ use_beam_search : bool ,
36
+ trust_remote_code : bool ,
37
+ dtype : str ,
38
+ max_model_len : Optional [ int ] ,
39
+ enforce_eager : bool ,
40
+ sparsity : Optional [ str ] ,
41
+ num_warmup_prompts : int ,
42
+ log_model_io : bool = False ) -> float :
43
43
from vllm import LLM , SamplingParams
44
44
llm = LLM (
45
45
model = model ,
@@ -53,13 +53,15 @@ def run_vllm(
53
53
enforce_eager = enforce_eager ,
54
54
)
55
55
56
- warmup_vllm_engine (engine = llm , model = model , num_prompts = 1000 )
56
+ warmup_vllm_engine (engine = llm , model = model , num_prompts = num_warmup_prompts )
57
57
58
58
# Add the requests to the engine.
59
59
for prompt , _ , output_len in requests :
60
60
sampling_params = SamplingParams (
61
61
n = n ,
62
- temperature = 0.0 if use_beam_search else 1.0 ,
62
+ # TODO(varun): Make temperature configurable; for now it is fixed at 0.0 regardless of use_beam_search.
63
+ #temperature=0.0 if use_beam_search else 1.0,
64
+ temperature = 0.0 ,
63
65
top_p = 1.0 ,
64
66
use_beam_search = use_beam_search ,
65
67
ignore_eos = True ,
@@ -74,9 +76,12 @@ def run_vllm(
74
76
75
77
start = time .perf_counter ()
76
78
# FIXME(woosuk): Do not use internal method.
77
- llm ._run_engine (use_tqdm = True )
79
+ outputs = llm ._run_engine (use_tqdm = True )
78
80
end = time .perf_counter ()
79
81
82
+ if log_model_io :
83
+ print_request_outputs (outputs )
84
+
80
85
return end - start
81
86
82
87
@@ -96,7 +101,7 @@ def main(args: argparse.Namespace):
96
101
num_samples = args .num_prompts ,
97
102
max_len = 2048 ,
98
103
seed = 42 ,
99
- ))
104
+ fixed_output_len = args . output_len ))
100
105
else :
101
106
# Make a synthetic dataset.
102
107
requests = generate_synthetic_requests (args .input_len , args .output_len ,
@@ -114,7 +119,9 @@ def main(args: argparse.Namespace):
114
119
args .dtype ,
115
120
args .max_model_len ,
116
121
args .enforce_eager ,
117
- sparsity = args .sparsity )
122
+ sparsity = args .sparsity ,
123
+ num_warmup_prompts = args .num_warmup_prompts ,
124
+ log_model_io = args .log_model_io )
118
125
119
126
total_prompt_tokens = sum (prompt_len for _ , prompt_len , _ in requests )
120
127
total_output_tokens = sum (output_len for _ , _ , output_len in requests )
@@ -189,10 +196,15 @@ def main(args: argparse.Namespace):
189
196
type = int ,
190
197
default = 1000 ,
191
198
help = "Number of prompts to process." )
199
+ parser .add_argument ("--num-warmup-prompts" ,
200
+ type = int ,
201
+ default = 1000 ,
202
+ help = "Number of prompts to do warmups with." )
192
203
parser .add_argument ("--seed" , type = int , default = 0 )
193
204
parser .add_argument ('--trust-remote-code' ,
194
205
action = 'store_true' ,
195
206
help = 'trust remote code from huggingface' )
207
+ parser .add_argument ("--log-model-io" , action = "store_true" )
196
208
parser .add_argument (
197
209
'--max-model-len' ,
198
210
type = int ,
0 commit comments