[Inference] Support vllm testing in benchmark scripts #5379

Merged
Changes from 3 commits
14 changes: 10 additions & 4 deletions colossalai/inference/core/engine.py
@@ -139,6 +139,7 @@ def generate(
self,
prompts: List[str] = None,
prompts_token_ids: Union[List[int], torch.Tensor, np.ndarray] = None,
return_token_ids: bool = False,
generation_config: Optional[GenerationConfig] = None,
) -> List[str]:
"""
@@ -147,6 +148,7 @@ def generate(
Args:
prompts (List[str], optional): Input prompts. Defaults to None.
prompts_token_ids (List[List[int]], optional): token ids of input prompts. Defaults to None.
return_token_ids (bool, optional): Whether to also return the output token ids. Defaults to False.
generation_config (GenerationConfig, optional): Huggingface GenerationConfig used for inference. Defaults to None.

Returns:
@@ -158,7 +160,7 @@ def generate(
self.add_request(prompts=prompts, prompts_token_ids=prompts_token_ids)

output_seqs_list = []
output_tokens_list = []
total_tokens_list = []

# Intuition: if the user provides a generation config, we should replace the existing one.
if generation_config is not None:
@@ -170,11 +172,15 @@
output_seqs_list = sorted(output_seqs_list, key=lambda x: int(x.request_id))

for seq in output_seqs_list:
output_tokens_list.append(seq.input_token_id + seq.output_token_id)
total_tokens_list.append(seq.input_token_id + seq.output_token_id)

output_str = self.tokenizer.batch_decode(output_tokens_list, skip_special_tokens=True)
output_str = self.tokenizer.batch_decode(total_tokens_list, skip_special_tokens=True)

return output_str
if return_token_ids:
output_tokens_list = [seq.output_token_id for seq in output_seqs_list]
return output_str, output_tokens_list
else:
return output_str

@property
def has_prompt_template(self) -> bool:
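A minimal usage sketch of the new return_token_ids flag. The engine construction and the model, tokenizer, inference_config, and prompt_ids names below are placeholders assumed to be set up as in the benchmark script elsewhere in this PR; they are not part of the diff.

from transformers import GenerationConfig
from colossalai.inference.core.engine import InferenceEngine

engine = InferenceEngine(model, tokenizer, inference_config, verbose=True)  # model/tokenizer/config assumed prepared
output_str, output_token_ids = engine.generate(
    prompts_token_ids=prompt_ids,   # List[List[int]] of input token ids (placeholder)
    generation_config=GenerationConfig(max_new_tokens=128, pad_token_id=tokenizer.pad_token_id),
    return_token_ids=True,          # new flag: also return the generated token ids per sequence
)
num_generated_tokens = sum(len(ids) for ids in output_token_ids)  # what the benchmark uses for throughput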
64 changes: 54 additions & 10 deletions examples/inference/benchmark_llama.py
@@ -6,6 +6,7 @@
import torch.distributed as dist
import transformers
from transformers import AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams

import colossalai
from colossalai.accelerator import get_accelerator
@@ -58,12 +59,12 @@ def data_gen(batch_size: int = 4, seq_len: int = 512):
return input_ids


def print_details_info(model_config, args, whole_end2end):
def print_details_info(model_config, args, whole_end2end, total_token_num):
msg: str = ""

if dist.get_rank() == 0:
msg += "-------Perf Summary-------\n"
whole_avg_latency = whole_end2end / (args.output_len * args.batch_size)
whole_avg_latency = whole_end2end / (total_token_num)
num_layers = getattr(model_config, "num_layers", model_config.num_hidden_layers)
num_parameters = num_layers * model_config.hidden_size * model_config.hidden_size * 12 / args.pp_size
if args.dtype in ["fp16", "bf16"]:
@@ -73,7 +74,7 @@ def print_details_info(model_config, args, whole_end2end):

msg += f"Whole batch end2end time: {whole_end2end * 1000:.2f} ms\n"
msg += f"Whole batch per token latency: {whole_avg_latency * 1000:.2f} ms\n"
msg += f"Throughput: {args.output_len * args.batch_size / whole_end2end:.2f} tokens/s\n"
msg += f"Throughput: {total_token_num / whole_end2end:.2f} tokens/s\n"
msg += f"Flops: {num_parameters * num_bytes / whole_avg_latency / 1e12:.2f} TFLOPS\n"

if torch.cuda.is_available():
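As a quick sanity check of the revised per-token metrics (the numbers below are made up, not measured):

total_token_num = 1024                                # tokens generated in the timed region
whole_end2end = 2.0                                   # seconds
whole_avg_latency = whole_end2end / total_token_num   # ~0.00195 s, i.e. ~1.95 ms per token
throughput = total_token_num / whole_end2end          # 512.0 tokens/s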
@@ -88,9 +89,15 @@ def benchmark_inference(args):
with torch.no_grad():
config = CONFIG_MAP[args.model]
config.pad_token_id = config.eos_token_id
model = transformers.LlamaForCausalLM(config).cuda()
if args.test_random_weight:
model = transformers.LlamaForCausalLM(config).cuda()
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
else:
assert args.model_path, "When testing real weights, the model path must be provided."
model = transformers.LlamaForCausalLM.from_pretrained(args.model_path).cuda()
tokenizer = AutoTokenizer.from_pretrained(args.model_path)

model = model.eval()
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

if args.dtype == "fp16":
model = model.half()
@@ -109,12 +116,27 @@
max_input_len=args.seq_len,
max_output_len=args.output_len,
prefill_ratio=1.2,
block_size=32,
)
engine = InferenceEngine(model, tokenizer, inference_config, verbose=True)
elif args.mode == "vllm":
engine = LLM(
model="/home/caidi/llama_model/",
max_num_seqs=mbsz,
dtype="float16",
enforce_eager=True,
)

sampling_params = SamplingParams(
max_tokens=args.output_len,
)
else:
engine = model

data = data_gen(mbsz, args.seq_len)

data = data.tolist()

generation_config = GenerationConfig(
pad_token_id=tokenizer.pad_token_id,
max_new_tokens=args.output_len,
@@ -132,7 +154,7 @@
torch.profiler.ProfilerActivity.CUDA,
],
schedule=torch.profiler.schedule(wait=0, warmup=N_WARMUP_STEPS, active=1),
on_trace_ready=torch.profiler.tensorboard_trace_handler("./tb_log_" + args.mode),
on_trace_ready=torch.profiler.tensorboard_trace_handler(f"./tb_log_{args.batch_size}_" + args.mode),
)
if args.profile
else nullcontext()
@@ -142,6 +164,8 @@
for _ in range(N_WARMUP_STEPS):
if args.mode == "caiinference":
engine.generate(prompts_token_ids=data, generation_config=generation_config)
elif args.mode == "vllm":
engine.generate(prompt_token_ids=data, sampling_params=sampling_params)
else:
engine.generate(data, generation_config=generation_config)
if args.profile:
@@ -153,19 +177,35 @@
torch.cuda.synchronize()

whole_end2end = time.perf_counter()

if args.mode == "caiinference":
for _ in range(args.batch_size // mbsz):
engine.generate(prompts_token_ids=data, generation_config=generation_config)
output, output_tokens_list = engine.generate(
prompts_token_ids=data, generation_config=generation_config, return_token_ids=True
)
elif args.mode == "vllm":
for _ in range(args.batch_size // mbsz):
output = engine.generate(prompt_token_ids=data, sampling_params=sampling_params)
else:
for _ in range(args.batch_size // mbsz):
engine.generate(data, generation_config=generation_config)
output = engine.generate(data, generation_config=generation_config)

whole_end2end = time.perf_counter() - whole_end2end

if args.mode == "caiinference":
total_token_num = sum([len(output_tokens) for output_tokens in output_tokens_list])
elif args.mode == "vllm":
total_token_num = sum([len(out.outputs[0].token_ids) for out in output])
else:
total_token_num = sum([len(out) for out in output])

print("total_token_num: ", total_token_num)
if args.nsys:
torch.cuda.cudart().cudaProfilerStop()
if args.profile:
ctx.step()

print_details_info(model.config, args, whole_end2end)
print_details_info(model.config, args, whole_end2end, total_token_num)


def hybrid_inference(rank, world_size, port, args):
@@ -188,6 +228,7 @@ def benchmark(args):
help="the size of model",
choices=["toy", "llama-7b", "llama-13b", "llama2-7b", "llama2-13b"],
)
parser.add_argument("--model_path", type=str, default=None, help="The true weight path")
parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size")
parser.add_argument("--mbsz", type=int, default=8, help="batch size for one step")
parser.add_argument("-s", "--seq_len", type=int, default=8, help="input sequence length")
@@ -197,12 +238,15 @@
parser.add_argument("--output_len", type=int, default=128, help="Output length")
parser.add_argument("--dtype", type=str, default="fp16", help="data type", choices=["fp16", "fp32", "bf16"])
parser.add_argument("-v", "--verbose", default=False, action="store_true")
parser.add_argument(
"--test_random_weight", default=False, action="store_true", help="whether to test random weight"
)
parser.add_argument("--profile", default=False, action="store_true", help="enable torch profiler")
parser.add_argument("--nsys", default=False, action="store_true", help="enable nsys profiler")
parser.add_argument(
"--mode",
default="caiinference",
choices=["caiinference", "transformers"],
choices=["caiinference", "transformers", "vllm"],
help="decide which inference framework to run",
)
parser.add_argument(
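For reference, a condensed sketch of the vllm measurement path the benchmark now exercises; the model path, batch size, and sequence lengths below are placeholders rather than values from this PR.

import time
from vllm import LLM, SamplingParams

llm = LLM(model="/path/to/llama", max_num_seqs=8, dtype="float16", enforce_eager=True)
sampling_params = SamplingParams(max_tokens=128)
data = [[1] * 512 for _ in range(8)]  # dummy prompt token ids, standing in for data_gen()

start = time.perf_counter()
outputs = llm.generate(prompt_token_ids=data, sampling_params=sampling_params)
elapsed = time.perf_counter() - start

# Count only generated tokens, mirroring how the script derives total_token_num for vllm.
total_token_num = sum(len(out.outputs[0].token_ids) for out in outputs)
print(f"Throughput: {total_token_num / elapsed:.2f} tokens/s")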