diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md
new file mode 100644
index 0000000000000..1e33793842bf8
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@@ -0,0 +1,28 @@
+
+## Description
+
+This file contains the download links for the benchmarking results.
+
+- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
+- [benchmarking results](artifact://results.zip)
+- [benchmarking code](artifact://nightly-benchmarks.zip)
+
+Please download the visualization scripts in the post.
+
+
+## Results reproduction
+
+- Find the Docker image we use in the `benchmarking pipeline`.
+- Deploy that Docker image, and inside the container:
+  - Download `nightly-benchmarks.zip`.
+  - In the same folder, run the following commands:
+```
+export HF_TOKEN=
+apt update
+apt install -y git
+unzip nightly-benchmarks.zip
+VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+```
+
+The results will be inside `./benchmarks/results`.
+
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index c3d3cbf473968..7dec7a0fe0b4e 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -1,45 +1,39 @@
 # Nightly benchmark
-The main goal of this benchmarking is two-fold:
-- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
-- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
-
-
-## Docker images
-
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
-- vllm/vllm-openai:v0.5.0.post1
-- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-- openmmlab/lmdeploy:v0.5.0
-- ghcr.io/huggingface/text-generation-inference:2.1
-
-
-
-
-## Hardware
-
-One AWS node with 8x NVIDIA A100 GPUs.
-
-
-## Workload description
-
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
-
-- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
-- Output length: the corresponding output length of these 500 prompts.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-
-
-## Plots
-
-In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
-
-Benchmarking results
-
-## Results
-
-{nightly_results_benchmarking_table}
+This benchmark aims to:
+- Provide performance clarity: show which engine (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance for which workload.
+- Be reproducible: anyone can run the exact same set of benchmarking commands inside the exact same Docker image by following the reproduction instructions.
+
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
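+
+A minimal local-reproduction sketch (condensed from the commands in [nightly-annotation.md](nightly-annotation.md); it assumes an 8x A100 machine and a valid HuggingFace token):
+
+```bash
+# fill in a HuggingFace token that can access the benchmarked checkpoints
+export HF_TOKEN=
+unzip nightly-benchmarks.zip
+VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+```
+
+Results land in `./benchmarks/results`; see [nightly-annotation.md](nightly-annotation.md) for the full step-by-step instructions.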
+
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+
+
+## Setup
+
+- Docker images:
+  - vLLM: `vllm/vllm-openai:v0.6.2`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we use r24.07 as the current implementation only works for this version. We will bump it up later.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete Docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
+    - Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of the datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1, as it is incompatible with trt-llm r24.07 ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+  - Average QPS (queries per second): 2, 4, 8, 16, 32 and inf.
+    - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed.
+  - Evaluation metrics: Throughput (higher is better), TTFT (time to the first token, lower is better), ITL (inter-token latency, lower is better).
+
+# Known issues
+
+- TRT-LLM crashes with Llama 3.1 8B ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+- TGI does not support the `ignore-eos` flag.
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 6e399bb936fbc..199517e8b067c 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec
 common_container_settings: &common_container_settings
   command:
-    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
   resources:
     limits:
       nvidia.com/gpu: 8
@@ -37,7 +37,10 @@ common_container_settings: &common_container_settings
 steps:
   - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
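+  # Each engine below runs as its own Buildkite step, sharing the pod spec and
+  # container settings through the YAML anchors defined above
+  # (*common_pod_spec, *common_container_settings) and swapping in the engine's
+  # container image. The TRT steps additionally set HF_TOKEN and TEST_SELECTOR
+  # so that the llama-8B and llama-70B benchmarks run as separate steps.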
- - label: "A100 trt benchmark" + + + + - label: "A100 vllm step 10" priority: 100 agents: queue: A100 @@ -46,7 +49,21 @@ steps: podSpec: <<: *common_pod_spec containers: - - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + - image: vllm/vllm-openai:v0.6.2 + <<: *common_container_settings + + + + - label: "A100 sglang benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: lmsysorg/sglang:v0.3.2-cu121 <<: *common_container_settings - label: "A100 lmdeploy benchmark" @@ -58,11 +75,13 @@ steps: podSpec: <<: *common_pod_spec containers: - - image: openmmlab/lmdeploy:v0.5.0 + - image: openmmlab/lmdeploy:v0.6.1-cu12 <<: *common_container_settings - - - label: "A100 vllm benchmark" + + + + - label: "A100 trt llama-8B" priority: 100 agents: queue: A100 @@ -71,10 +90,25 @@ steps: podSpec: <<: *common_pod_spec containers: - - image: vllm/vllm-openai:latest + - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 <<: *common_container_settings + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: TEST_SELECTOR + value: "llama8B" - - label: "A100 tgi benchmark" + + - label: "A100 trt llama-70B" priority: 100 agents: queue: A100 @@ -83,12 +117,54 @@ steps: podSpec: <<: *common_pod_spec containers: - - image: ghcr.io/huggingface/text-generation-inference:2.1 + - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 <<: *common_container_settings + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: TEST_SELECTOR + value: "llama70B" + + + # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image + # - label: "A100 trt benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # <<: *common_pod_spec + # containers: + # - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 + # <<: *common_container_settings + + + # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. + # - label: "A100 tgi benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # <<: *common_pod_spec + # containers: + # - image: ghcr.io/huggingface/text-generation-inference:2.2.0 + # <<: *common_container_settings - wait - - label: "Plot" + - label: "Collect the results" priority: 100 agents: queue: A100 @@ -117,4 +193,4 @@ steps: name: hf-token-secret key: token - - wait \ No newline at end of file + - block: ":rocket: check the results!" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh deleted file mode 100644 index 627a3e6971578..0000000000000 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -x - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." 
- exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -check_hf_token() { - # check if HF_TOKEN is available and valid - if [[ -z "$HF_TOKEN" ]]; then - echo "Error: HF_TOKEN is not set." - exit 1 - elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then - echo "Error: HF_TOKEN does not start with 'hf_'." - exit 1 - else - echo "HF_TOKEN is set and valid." - fi -} - -main() { - - check_gpus - check_hf_token - - df -h - - (which wget && which curl) || (apt-get update && apt-get install -y wget curl) - (which jq) || (apt-get update && apt-get -y install jq) - - cd $VLLM_SOURCE_CODE_LOC/benchmarks - wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - - # run lmdeploy - if which lmdeploy >/dev/null; then - echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh" - bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh - exit 0 - fi - - # run tgi - if [ -e /tgi-entrypoint.sh ]; then - echo "tgi is available, redirect to run-tgi-nightly.sh" - bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh - exit 0 - fi - - # run trt - if which trtllm-build >/dev/null; then - echo "trtllm is available, redirect to run-trt-nightly.sh" - bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh - exit 0 - fi - - # run vllm - if [ -e /vllm-workspace ]; then - echo "vllm is available, redirect to run-vllm-nightly.sh" - bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh - exit 0 - fi - -} - -main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py new file mode 100644 index 0000000000000..6059588fe7277 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -0,0 +1,95 @@ +import argparse +import json +from pathlib import Path + +import numpy as np +import pandas as pd +from tabulate import tabulate + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description= + 'Parse command line arguments for summary-nightly-results script.') + parser.add_argument('--results-folder', + type=str, + required=True, + help='The folder where the results are stored.') + parser.add_argument('--description', + type=str, + required=True, + help='Description of the results.') + + args = parser.parse_args() + return args + + +def get_perf(df, method, model, metric): + + means = [] + + for qps in [2, 4, 8, 16, "inf"]: + target = df['Test name'].str.contains(model) + target = target & df['Engine'].str.contains(method) + target = target & df['Test name'].str.contains("qps_" + str(qps)) + filtered_df = df[target] + + if filtered_df.empty: + means.append(0.) 
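+            # No result matched this (engine, model, QPS) combination (e.g. the
+            # run crashed or was skipped), so record 0 to keep one entry per QPS
+            # in the returned array.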
+ else: + means.append(filtered_df[metric].values[0]) + + return np.array(means) + + +def get_perf_w_std(df, method, model, metric): + + if metric in ["TTFT", "ITL"]: + mean = get_perf(df, method, model, "Mean " + metric + " (ms)") + mean = mean.tolist() + std = get_perf(df, method, model, "Std " + metric + " (ms)") + if std.mean() == 0: + std = None + success = get_perf(df, method, model, "Successful req.") + if std is not None: + std = std / np.sqrt(success) + std = std.tolist() + + else: + assert metric == "Tput" + mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( + df, method, model, "Output Tput (tok/s)") + mean = mean.tolist() + std = None + + return mean, std + + +def main(args): + results_folder = Path(args.results_folder) + + results = [] + + # collect results + for test_file in results_folder.glob("*_nightly_results.json"): + with open(test_file, "r") as f: + results = results + json.loads(f.read()) + + # generate markdown table + df = pd.DataFrame.from_dict(results) + + md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) + + with open(args.description, "r") as f: + description = f.read() + + description = description.format( + nightly_results_benchmarking_table=md_table) + + with open("nightly_results.md", "w") as f: + f.write(description) + + +if __name__ == '__main__': + args = parse_arguments() + main(args) diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh new file mode 100644 index 0000000000000..e9d7d6a8d760a --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh @@ -0,0 +1,241 @@ +#!/bin/bash + +# Currently FP8 benchmark is NOT enabled. + +set -x +server_params=$1 +common_params=$2 + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +launch_trt_server() { + + model_path=$(echo "$common_params" | jq -r '.model') + model_name="${model_path#*/}" + model_type=$(echo "$server_params" | jq -r '.model_type') + model_dtype=$(echo "$server_params" | jq -r '.model_dtype') + model_tp_size=$(echo "$common_params" | jq -r '.tp') + max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') + max_input_len=$(echo "$server_params" | jq -r '.max_input_len') + max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len') + max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens') + trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') + + # create model caching directory + cd ~ + rm -rf models + mkdir -p models + cd models + models_dir=$(pwd) + trt_model_path=${models_dir}/${model_name}-trt-ckpt + trt_engine_path=${models_dir}/${model_name}-trt-engine + + # clone tensorrt backend + cd / + rm -rf tensorrtllm_backend + git clone https://github.com/triton-inference-server/tensorrtllm_backend.git + git lfs install + cd tensorrtllm_backend + git checkout $trt_llm_version + tensorrtllm_backend_dir=$(pwd) + git submodule update --init --recursive + + # build trtllm engine + cd /tensorrtllm_backend + cd ./tensorrt_llm/examples/${model_type} + python3 convert_checkpoint.py \ + --model_dir ${model_path} \ + --dtype ${model_dtype} \ + 
--tp_size ${model_tp_size} \ + --output_dir ${trt_model_path} + trtllm-build \ + --checkpoint_dir ${trt_model_path} \ + --use_fused_mlp \ + --reduce_fusion disable \ + --workers 8 \ + --gpt_attention_plugin ${model_dtype} \ + --gemm_plugin ${model_dtype} \ + --tp_size ${model_tp_size} \ + --max_batch_size ${max_batch_size} \ + --max_input_len ${max_input_len} \ + --max_seq_len ${max_seq_len} \ + --max_num_tokens ${max_num_tokens} \ + --output_dir ${trt_engine_path} + + # handle triton protobuf files and launch triton server + cd /tensorrtllm_backend + mkdir triton_model_repo + cp -r all_models/inflight_batcher_llm/* triton_model_repo/ + cd triton_model_repo + rm -rf ./tensorrt_llm/1/* + cp -r ${trt_engine_path}/* ./tensorrt_llm/1 + python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false + python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5 + python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false + python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size + python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1 + cd /tensorrtllm_backend + python3 scripts/launch_triton_server.py \ + --world_size=${model_tp_size} \ + --model_repo=/tensorrtllm_backend/triton_model_repo & + +} + +launch_tgi_server() { + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + --quantize fp8 \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." 
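+    # $server_args comes from json2args above: the engine-specific parameters in
+    # tests/nightly-tests.json are turned into extra CLI flags and appended last,
+    # so individual test cases can add their own TGI server options.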
+ server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + $server_args" + fi + + echo "Server command: $server_command" + eval "$server_command" & + +} + +launch_lmdeploy_server() { + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + server_args=$(json2args "$server_params") + + server_command="lmdeploy serve api_server $model \ + --tp $tp \ + --server-port $port \ + $server_args" + + # run the server + echo "Server command: $server_command" + bash -c "$server_command" & +} + +launch_sglang_server() { + + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." + model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') + server_command="python3 \ + -m sglang.launch_server \ + --tp $tp \ + --model-path $model \ + --port $port \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="python3 \ + -m sglang.launch_server \ + --tp $tp \ + --model-path $model \ + --port $port \ + $server_args" + fi + + # run the server + echo "Server command: $server_command" + eval "$server_command" & +} + +launch_vllm_server() { + + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + server_args=$(json2args "$server_params") + + if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." + model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." 
+ server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + fi + + # run the server + echo "Server command: $server_command" + eval "$server_command" & +} + +main() { + + if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then + launch_trt_server + fi + + if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then + launch_tgi_server + fi + + if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then + launch_lmdeploy_server + fi + + if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then + launch_sglang_server + fi + + if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then + launch_vllm_server + fi +} + +main diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh deleted file mode 100644 index f8262653a6628..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash - - -server_params=$1 -common_params=$2 - - - -model_path=$(echo "$common_params" | jq -r '.model') -model_name="${model_path#*/}" -model_type=$(echo "$server_params" | jq -r '.model_type') -model_dtype=$(echo "$server_params" | jq -r '.model_dtype') -model_tp_size=$(echo "$common_params" | jq -r '.tp') -max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') -max_input_len=$(echo "$server_params" | jq -r '.max_input_len') -max_output_len=$(echo "$server_params" | jq -r '.max_output_len') -trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') - -cd ~ -rm -rf models -mkdir -p models -cd models -models_dir=$(pwd) -trt_model_path=${models_dir}/${model_name}-trt-ckpt -trt_engine_path=${models_dir}/${model_name}-trt-engine - -cd ~ -rm -rf tensorrt-demo -git clone https://github.com/neuralmagic/tensorrt-demo.git -cd tensorrt-demo -tensorrt_demo_dir=$(pwd) - -# make sure the parameter inside tensorrt_demo is consistent to envvar -sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt -sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt - - -cd / -rm -rf tensorrtllm_backend -git clone https://github.com/triton-inference-server/tensorrtllm_backend.git -git lfs install -cd tensorrtllm_backend -git checkout $trt_llm_version -tensorrtllm_backend_dir=$(pwd) -git submodule update --init --recursive -cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/ - -cd /tensorrtllm_backend -cd ./tensorrt_llm/examples/${model_type} - - -if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then - - echo "Key 'fp8' exists in common params. 
Use quantize.py instead of convert_checkpoint.py" - echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md" - python ../quantization/quantize.py \ - --model_dir ${model_path} \ - --dtype ${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} \ - --qformat fp8 \ - --kv_cache_dtype fp8 \ - --calib_size 2 - -else - - echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py" - python3 convert_checkpoint.py \ - --model_dir ${model_path} \ - --dtype ${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} - -fi - - - -trtllm-build \ ---checkpoint_dir=${trt_model_path} \ ---gpt_attention_plugin=${model_dtype} \ ---gemm_plugin=${model_dtype} \ ---remove_input_padding=enable \ ---paged_kv_cache=enable \ ---tp_size=${model_tp_size} \ ---max_batch_size=${max_batch_size} \ ---max_input_len=${max_input_len} \ ---max_output_len=${max_output_len} \ ---max_num_tokens=${max_output_len} \ ---opt_num_tokens=${max_output_len} \ ---output_dir=${trt_engine_path} - -cd /tensorrtllm_backend/triton_model_repo -rm -rf ./tensorrt_llm/1/* -cp -r ${trt_engine_path}/* ./tensorrt_llm/1 -cd /tensorrtllm_backend -python3 scripts/launch_triton_server.py \ ---world_size=${model_tp_size} \ ---model_repo=/tensorrtllm_backend/triton_model_repo & \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 1168912c6e229..c6a1bbdeb7d48 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -8,6 +8,7 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) + (which zip) || (apt-get install -y zip) if [ ! -f /workspace/buildkite-agent ]; then echo "buildkite-agent binary not found. Skip plotting the results." 
@@ -24,17 +25,54 @@ main() { ls ls results/ - # generate figures - python3 -m pip install tabulate pandas matplotlib - python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - --description $description \ - --results-folder results/ + # upload benchmark results + zip -r results.zip results/ + /workspace/buildkite-agent artifact upload "results.zip" + + # upload benchmarking scripts + cd $VLLM_SOURCE_CODE_LOC/ + zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ + /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" + + cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + # upload benchmarking pipeline + /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" + + cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md + + + + # The figures should be genereated by a separate process outside the CI/CD pipeline + + # # generate figures + # python3 -m pip install tabulate pandas matplotlib + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ + # --description $description \ + # --results-folder results/ + + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sharegpt + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sonnet_2048_128 + + # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + # --description $description \ + # --results-folder results/ \ + # --dataset sonnet_128_2048 - # upload results and figures - /workspace/buildkite-agent artifact upload "nightly_results.png" - /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml - /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json - /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md + # # upload results and figures + # /workspace/buildkite-agent artifact upload "nightly_results*.png" + # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml + # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json + # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md } main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py deleted file mode 100644 index e5cfcc64a9b2a..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ /dev/null @@ -1,135 +0,0 @@ -import argparse -import json -import math -from pathlib import Path - -import matplotlib.pyplot as plt -import pandas as pd -from tabulate import tabulate - - -def parse_arguments(): - parser = argparse.ArgumentParser( - description= - 'Parse command line arguments for summary-nightly-results script.') - parser.add_argument('--results-folder', - type=str, - required=True, - help='The folder where the results are stored.') - 
parser.add_argument('--description', - type=str, - required=True, - help='Description of the results.') - - args = parser.parse_args() - return args - - -def main(args): - bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00'] - results_folder = Path(args.results_folder) - - results = [] - - # collect results - for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file, "r") as f: - results = results + json.loads(f.read()) - - # generate markdown table - df = pd.DataFrame.from_dict(results) - - md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) - - with open(args.description, "r") as f: - description = f.read() - - description = description.format( - nightly_results_benchmarking_table=md_table) - - with open("nightly_results.md", "w") as f: - f.write(description) - - plt.rcParams.update({'font.size': 20}) - - # plot results - fig, axes = plt.subplots(3, 3, figsize=(16, 14)) - fig.subplots_adjust(hspace=1) - methods = ["vllm", "trt", "lmdeploy", "tgi"] - for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): - for j, metric in enumerate(["TTFT", "ITL"]): - means, stds = [], [] - for method in methods: - target = df['Test name'].str.contains(model) - target = target & df['Engine'].str.contains(method) - filtered_df = df[target] - - if filtered_df.empty: - means.append(0.) - stds.append(0.) - else: - means.append(filtered_df[f"Mean {metric} (ms)"].values[0]) - std = filtered_df[f"Std {metric} (ms)"].values[0] - success = filtered_df["Successful req."].values[0] - stds.append(std / math.sqrt(success)) - - print(model, metric) - print(means, stds) - - ax = axes[i, j + 1] - - bars = ax.bar( - ["vllm", "trt", "lmdeploy", "tgi"], - means, - yerr=stds, - capsize=10, - ) - for idx, bar in enumerate(bars): - bar.set_color(bar_colors[idx]) - ax.set_ylim(bottom=0) - - ax.set_ylabel(f"{metric} (ms)") - ax.set_title(f"{model} {metric}") - ax.grid(axis='y') - - metric = "Tput" - j = 0 - if True: - tputs = [] - for method in methods: - target = df['Test name'].str.contains(model) - target = target & df['Engine'].str.contains(method) - filtered_df = df[target] - - if filtered_df.empty: - tputs.append(0.) - else: - input_tput = filtered_df["Input Tput (tok/s)"].values[0] - output_tput = filtered_df["Output Tput (tok/s)"].values[0] - tputs.append(input_tput + output_tput) - - print(model, metric) - print(tputs) - - ax = axes[i, j] - - bars = ax.bar( - ["vllm", "trt", "lmdeploy", "tgi"], - tputs, - ) - for idx, bar in enumerate(bars): - bar.set_color(bar_colors[idx]) - - ax.set_ylim(bottom=0) - - ax.set_ylabel("Tput (token/s)") - ax.set_title(f"{model} {metric}") - ax.grid(axis='y') - - fig.tight_layout() - fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400) - - -if __name__ == '__main__': - args = parse_arguments() - main(args) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh deleted file mode 100644 index d6f112aaa42fd..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ /dev/null @@ -1,218 +0,0 @@ -#!/bin/bash - -set -o pipefail - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." 
- exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -kill_gpu_processes() { - pkill lmdeploy || true - # waiting for GPU processes to be fully killed - sleep 10 - # Print the GPU memory usage - # so that we know if all GPU processes are killed. - gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. - echo "GPU 0 Memory Usage: $gpu_memory_usage MB" -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - timeout 1200 bash -c ' - until curl -s localhost:8000/v1/completions > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -run_serving_tests() { - # run serving tests using `benchmark_serving.py` - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # append lmdeploy to the test name - test_name=lmdeploy_$test_name - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - - - - # get client and server arguments - server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') - client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') - server_args=$(json2args "$server_params") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - # prepare tokenizer - rm -rf /tokenizer_cache - mkdir /tokenizer_cache - python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$model" \ - --cachedir /tokenizer_cache - - server_command="lmdeploy serve api_server $model \ - --tp $tp \ - --server-port $port \ - $server_args" - - # run the server - echo "Running test case $test_name" - echo "Server command: $server_command" - bash -c "$server_command" & - - # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then - echo "" - echo "lmdeploy server is up and running." 
- else - echo "" - echo "lmdeploy failed to start within the timeout period." - break - fi - - # get model name - model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - client_command="python3 benchmark_serving.py \ - --backend lmdeploy \ - --tokenizer /tokenizer_cache \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --model \"$model_name\" \ - $client_args" - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "lmdeploy" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - # clean up - kill_gpu_processes - rm -rf /root/.cache/huggingface/* - done -} - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - - -main() { - - check_gpus - # enter vllm directory - cd $VLLM_SOURCE_CODE_LOC/benchmarks - - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ - - python -m pip install transformers==4.41.2 - - export CURRENT_LLM_SERVING_ENGINE=lmdeploy - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - python -m pip install tabulate pandas - python $BENCHMARK_ROOT/scripts/summary-nightly-results.py - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh new file mode 100644 index 0000000000000..dd8c15e0700eb --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -0,0 +1,357 @@ +#!/bin/bash + +set -o pipefail +set -x + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." 
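+        # This only checks the token format (HuggingFace user access tokens
+        # start with "hf_"); whether the token can actually access gated models
+        # is only verified later, when the weights are downloaded.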
+ fi +} + + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + + +get_current_llm_serving_engine() { + + if which lmdeploy >/dev/null; then + echo "Container: lmdeploy" + export CURRENT_LLM_SERVING_ENGINE=lmdeploy + return + fi + + if [ -e /tgi-entrypoint.sh ]; then + echo "Container: tgi" + export CURRENT_LLM_SERVING_ENGINE=tgi + return + fi + + if which trtllm-build >/dev/null; then + echo "Container: tensorrt-llm" + export CURRENT_LLM_SERVING_ENGINE=trt + return + fi + + if [ -e /sgl-workspace ]; then + echo "Container: sglang" + export CURRENT_LLM_SERVING_ENGINE=sglang + return + fi + + if [ -e /vllm-workspace ]; then + echo "Container: vllm" + # move to a completely irrelevant directory, to avoid import vllm from current folder + export CURRENT_LLM_SERVING_ENGINE=vllm + + return + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +kill_gpu_processes() { + pkill -f python + pkill -f python3 + pkill -f tritonserver + pkill -f pt_main_thread + pkill -f text-generation + pkill -f lmdeploy + + while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + sleep 1 + done +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl -s localhost:8000/v1/completions > /dev/null; do + sleep 1 + done' && return 0 || return 1 +} + +ensure_installed() { + # Ensure that the given command is installed by apt-get + local cmd=$1 + if ! which $cmd >/dev/null; then + apt-get update && apt-get install -y $cmd + fi +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
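+            # TEST_SELECTOR is treated as a bash regex against the test name;
+            # e.g. TEST_SELECTOR=llama8B (set per step in nightly-pipeline.yaml)
+            # keeps only the llama-8B test cases for this engine.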
+ continue + fi + + # prepend the current serving engine to the test name + test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + reuse_server=$(echo "$common_params" | jq -r '.reuse_server') + + # get client and server arguments + server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") + client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + if [[ $reuse_server == "true" ]]; then + echo "Reuse previous server for test case $test_name" + else + kill_gpu_processes + bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \ + "$server_params" "$common_params" + fi + + wait_for_server + + if [ $? -eq 0 ]; then + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." + else + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." + break + fi + + # prepare tokenizer + # this is required for lmdeploy. + cd $VLLM_SOURCE_CODE_LOC/benchmarks + rm -rf /tokenizer_cache + mkdir /tokenizer_cache + python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ + --model "$model" \ + --cachedir /tokenizer_cache + cd $VLLM_SOURCE_CODE_LOC/benchmarks + + + # change model name for lmdeploy (it will not follow standard hf name) + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then + model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + backend=$CURRENT_LLM_SERVING_ENGINE + + if [[ $backend = "trt" ]]; then + backend="tensorrt-llm" + fi + + if [[ "$backend" == *"vllm"* ]]; then + backend="vllm" + fi + + if [[ "$dataset_name" = "sharegpt" ]]; then + + client_command="python3 benchmark_serving.py \ + --backend $backend \ + --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --ignore-eos \ + $client_args" + + elif [[ "$dataset_name" = "sonnet" ]]; then + + sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len') + sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') + sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') + + client_command="python3 benchmark_serving.py \ + --backend $backend \ + --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path 
$dataset_path \ + --num-prompts $num_prompts \ + --sonnet-input-len $sonnet_input_len \ + --sonnet-output-len $sonnet_output_len \ + --sonnet-prefix-len $sonnet_prefix_len \ + --port $port \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --ignore-eos \ + $client_args" + + else + + echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." + exit 1 + + fi + + + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + server_command="None" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + done + + kill_gpu_processes +} + + +prepare_dataset() { + + # download sharegpt dataset + cd $VLLM_SOURCE_CODE_LOC/benchmarks + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + # duplicate sonnet by 4x, to allow benchmarking with input length 2048 + cd $VLLM_SOURCE_CODE_LOC/benchmarks + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + +} + +main() { + + # check if the environment variable is successfully injected from yaml + + check_gpus + check_hf_token + get_current_llm_serving_engine + + pip install -U transformers + + # check storage + df -h + + ensure_installed wget + ensure_installed curl + ensure_installed jq + + prepare_dataset + + cd $VLLM_SOURCE_CODE_LOC/benchmarks + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + + # run the test + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + + # upload benchmark results to buildkite + python3 -m pip install tabulate pandas + python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite + +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh deleted file mode 100644 index fed03654f8b77..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ /dev/null @@ -1,216 +0,0 @@ -#!/bin/bash - -set -o pipefail - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -kill_gpu_processes() { - pkill text-generation || true - # waiting for GPU processes to be fully killed - sleep 10 - # Print the GPU memory usage - # so that we know if all GPU processes are killed. - gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. 
- echo "GPU 0 Memory Usage: $gpu_memory_usage MB" -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -wait_for_server() { - timeout 1200 bash -c ' - until curl -s localhost:8000/generate_stream > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -run_serving_tests() { - # run serving tests using `benchmark_serving.py` - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # append tgi to the test name - test_name=tgi_$test_name - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - - # get client and server arguments - server_params=$(echo "$params" | jq -r '.tgi_server_parameters') - client_params=$(echo "$params" | jq -r '.tgi_client_parameters') - server_args=$(json2args "$server_params") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then - echo "Key 'fp8' exists in common params." - server_command="/tgi-entrypoint.sh \ - --model-id $model \ - --num-shard $tp \ - --port $port \ - --quantize fp8 \ - $server_args" - else - echo "Key 'fp8' does not exist in common params." - server_command="/tgi-entrypoint.sh \ - --model-id $model \ - --num-shard $tp \ - --port $port \ - $server_args" - fi - - - - - # run the server - echo "Running test case $test_name" - echo "Server command: $server_command" - eval "$server_command" & - - # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then - echo "" - echo "tgi server is up and running." - else - echo "" - echo "tgi failed to start within the timeout period." 
- break - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - client_command="python3 benchmark_serving.py \ - --backend tgi \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - $client_args" - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "tgi" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - # clean up - kill_gpu_processes - rm -rf /root/.cache/huggingface/* - done -} - - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - -main() { - - check_gpus - # enter vllm directory - cd $VLLM_SOURCE_CODE_LOC/benchmarks - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ - - export CURRENT_LLM_SERVING_ENGINE=tgi - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - python -m pip install tabulate pandas - python $BENCHMARK_ROOT/scripts/summary-nightly-results.py - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh deleted file mode 100644 index 4a82b9ec64d71..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ /dev/null @@ -1,214 +0,0 @@ -#!/bin/bash - -set -o pipefail - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -kill_gpu_processes() { - pkill tritonserver || true - # waiting for GPU processes to be fully killed - sleep 20 - # Print the GPU memory usage - # so that we know if all GPU processes are killed. - gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. 
- echo "GPU 0 Memory Usage: $gpu_memory_usage MB" -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -wait_for_server() { - timeout 1200 bash -c ' - until curl -s localhost:8000/generate_stream > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -run_serving_tests() { - # run serving tests using `benchmark_serving.py` - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # append trt to the test name - test_name=trt_$test_name - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - - # get client and server arguments - server_params=$(echo "$params" | jq -r '.trt_server_parameters') - client_params=$(echo "$params" | jq -r '.trt_client_parameters') - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - - - cd $VLLM_SOURCE_CODE_LOC/benchmarks - - - echo "Running test case $test_name" - bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params" - - # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then - echo "" - echo "trt server is up and running." - else - echo "" - echo "trt failed to start within the timeout period." 
- break - fi - - # prepare tokenizer - cd $VLLM_SOURCE_CODE_LOC/benchmarks - rm -rf /tokenizer_cache - mkdir /tokenizer_cache - python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$model" \ - --cachedir /tokenizer_cache - cd $VLLM_SOURCE_CODE_LOC/benchmarks - - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - client_command="python3 benchmark_serving.py \ - --backend tensorrt-llm \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - $client_args" - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - server_command="" - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "trt" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - # clean up - kill_gpu_processes - rm -rf /root/.cache/huggingface/* - done -} - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - - -main() { - - check_gpus - - - # enter vllm directory - cd $VLLM_SOURCE_CODE_LOC/benchmarks - - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ - - # update transformers package, to make sure mixtral tokenizer is available - python -m pip install transformers -U - - export CURRENT_LLM_SERVING_ENGINE=trt - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - python -m pip install tabulate pandas - python $BENCHMARK_ROOT/scripts/summary-nightly-results.py - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh deleted file mode 100644 index 663045b8a9122..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ /dev/null @@ -1,221 +0,0 @@ -#!/bin/bash - -set -o pipefail - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') - echo "GPU type is $gpu_type" -} - -kill_gpu_processes() { - # kill all processes on GPU. - pkill pt_main_thread - sleep 10 - - # remove vllm config file - rm -rf ~/.config/vllm - - # Print the GPU memory usage - # so that we know if all GPU processes are killed. 
- gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. - echo "GPU 0 Memory Usage: $gpu_memory_usage MB" -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - timeout 1200 bash -c ' - until curl -s localhost:8000/v1/completions > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -run_serving_tests() { - # run serving tests using `benchmark_serving.py` - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # append vllm to the test name - test_name=vllm_$test_name - - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - - # get client and server arguments - server_params=$(echo "$params" | jq -r '.vllm_server_parameters') - client_params=$(echo "$params" | jq -r '.vllm_client_parameters') - server_args=$(json2args "$server_params") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then - echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." - model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') - server_command="python3 \ - -m vllm.entrypoints.openai.api_server \ - -tp $tp \ - --model $model \ - --port $port \ - $server_args" - else - echo "Key 'fp8' does not exist in common params." - server_command="python3 \ - -m vllm.entrypoints.openai.api_server \ - -tp $tp \ - --model $model \ - --port $port \ - $server_args" - fi - - # run the server - echo "Running test case $test_name" - echo "Server command: $server_command" - eval "$server_command" & - - # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then - echo "" - echo "vllm server is up and running." - else - echo "" - echo "vllm failed to start within the timeout period." 
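`wait_for_server` above polls the serving endpoint until it answers or a 1200-second budget runs out (the vLLM variant probes `localhost:8000/v1/completions`, the TRT variant `localhost:8000/generate_stream`). A hypothetical Python equivalent of that readiness check, for illustration only:

```
import time
import urllib.error
import urllib.request


def wait_for_server(url: str = "http://localhost:8000/v1/completions",
                    timeout_s: int = 1200) -> bool:
    # Poll until any HTTP response arrives, mirroring the shell `timeout 1200`.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            urllib.request.urlopen(url, timeout=5)
            return True
        except urllib.error.HTTPError:
            # An HTTP error (e.g. 405 for a bare GET) still means the server is up.
            return True
        except (urllib.error.URLError, OSError):
            time.sleep(1)
    return False
```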
- break - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - client_command="python3 benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - $client_args" - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "vllm" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - # clean up - kill_gpu_processes - rm -rf /root/.cache/huggingface/* - done -} - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - -main() { - - check_gpus - # enter vllm directory - cd $VLLM_SOURCE_CODE_LOC/benchmarks - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ - - export CURRENT_LLM_SERVING_ENGINE=vllm - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - - python3 -m pip install tabulate pandas - python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 782d1ef9aab98..4e4d4cd4ca3c6 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -17,10 +17,17 @@ "request_throughput": "Tput (req/s)", "mean_ttft_ms": "Mean TTFT (ms)", "std_ttft_ms": "Std TTFT (ms)", + "median_ttft_ms": "Median TTFT (ms)", "mean_itl_ms": "Mean ITL (ms)", "std_itl_ms": "Std ITL (ms)", - "input_throughput": "Input Tput (tok/s)", + "median_itl_ms": "Median ITL (ms)", + "mean_tpot_ms": "Mean TPOT (ms)", + "std_tpot_ms": "Std TPOT (ms)", + "median_tpot_ms": "Median TPOT (ms)", + "total_token_throughput": "Total Token Tput (tok/s)", "output_throughput": "Output Tput (tok/s)", + "total_input_tokens": "Total input tokens", + "total_output_tokens": "Total output tokens", "engine": "Engine", } diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index f250833c62710..fda1a7a3ec53c 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,16 +1,18 @@ [ { - "test_name": "llama8B_tp1", - "qps_list": [4], + "test_name": "llama8B_tp1_sharegpt", + "qps_list": [4,8,16,32,"inf"], "common_parameters": { - 
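The extended column mapping in `summary-nightly-results.py` above is used to rename the raw metric keys emitted by `benchmark_serving.py` into human-readable table headers. The sketch below shows that renaming step; the dictionary name, the DataFrame plumbing and the numbers are placeholders, only the key/header pairs come from the diff.

```
import pandas as pd

# Subset of the mapping added in the diff above (raw key -> table header).
serving_column_mapping = {
    "request_throughput": "Tput (req/s)",
    "median_ttft_ms": "Median TTFT (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "median_tpot_ms": "Median TPOT (ms)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "engine": "Engine",
}

# Placeholder row; real values come from the *.json files in the results folder.
raw_results = pd.DataFrame([{
    "request_throughput": 4.0,
    "median_ttft_ms": 35.0,
    "median_itl_ms": 9.5,
    "median_tpot_ms": 10.2,
    "total_token_throughput": 2400.0,
    "output_throughput": 850.0,
    "engine": "vllm",
}])

summary = raw_results[list(serving_column_mapping)].rename(
    columns=serving_column_mapping)
print(summary.to_string(index=False))
```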
"model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3-8B-Instruct", "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, - "port": 8000 + "port": 8000, + "reuse_server": false }, "lmdeploy_server_parameters": { + "dtype": "bfloat16" }, "lmdeploy_client_parameters": { }, @@ -21,34 +23,158 @@ }, "trt_server_parameters": { "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, + "model_dtype": "bfloat16", + "max_batch_size": 2048, "max_input_len": 4096, - "max_output_len": 4096, - "trt_llm_version": "r24.04" + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" }, "trt_client_parameters": { "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "" + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": 
"bfloat16" }, "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { } }, { - "test_name": "llama70B_tp4", - "qps_list": [2], + "test_name": "llama70B_tp4_sharegpt", + "qps_list": [4,8,16,32,"inf"], "common_parameters": { "model": "meta-llama/Meta-Llama-3-70B-Instruct", "tp": 4, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, - "port": 8000 + "port": 8000, + "reuse_server": false }, "lmdeploy_server_parameters": { + "dtype": "bfloat16" }, "lmdeploy_client_parameters": { }, @@ -59,34 +185,50 @@ }, "trt_server_parameters": { "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, + "model_dtype": "bfloat16", + "max_batch_size": 2048, "max_input_len": 4096, - "max_output_len": 4096, - "trt_llm_version": "r24.04" + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" }, "trt_client_parameters": { "endpoint": "/v2/models/ensemble/generate_stream" - }, + }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "" + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" }, "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { } }, { - "test_name": "mixtral8x7B_tp2", - "qps_list": [2], + "test_name": "llama70B_tp4_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], "common_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tp": 2, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", "num_prompts": 500, - "port": 8000 + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true }, "lmdeploy_server_parameters": { + "dtype": "bfloat16" }, "lmdeploy_client_parameters": { }, @@ -97,20 +239,85 @@ }, "trt_server_parameters": { "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, + "model_dtype": "bfloat16", + "max_batch_size": 2048, "max_input_len": 4096, - "max_output_len": 4096, - "trt_llm_version": "r24.04" + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" }, "trt_client_parameters": { "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + 
"trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "" + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" }, "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { } } ] \ No newline at end of file diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 3def4a6d67acf..bcd38461617a8 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -26,6 +26,7 @@ class RequestFuncInput: use_beam_search: bool = False logprobs: Optional[int] = None multi_modal_content: Optional[dict] = None + ignore_eos: bool = False @dataclass @@ -55,6 +56,7 @@ async def async_request_tgi( "do_sample": True, "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. + # TGI does not accept ignore_eos flag. } payload = { "inputs": request_func_input.prompt, @@ -129,6 +131,8 @@ async def async_request_trt_llm( "max_tokens": request_func_input.output_len, "stream": True, } + if request_func_input.ignore_eos: + payload["min_length"] = request_func_input.output_len output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -240,6 +244,7 @@ async def async_request_openai_completions( "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, + "ignore_eos": request_func_input.ignore_eos, } headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" @@ -327,6 +332,7 @@ async def async_request_openai_chat_completions( "temperature": 0.0, "max_tokens": request_func_input.output_len, "stream": True, + "ignore_eos": request_func_input.ignore_eos, } headers = { "Content-Type": "application/json", @@ -430,4 +436,5 @@ def get_tokenizer( "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, + "sglang": async_request_openai_completions, } diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 56c37b241a359..0460f4c0094be 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -397,6 +397,7 @@ async def benchmark( profile: bool, selected_percentile_metrics: List[str], selected_percentiles: List[str], + ignore_eos: bool, ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -420,6 +421,7 @@ async def benchmark( best_of=best_of, use_beam_search=use_beam_search, multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, ) test_output = await request_func(request_func_input=test_input) if not test_output.success: @@ -685,6 +687,7 @@ def main(args: argparse.Namespace): selected_percentiles=[ float(p) for p in args.metric_percentiles.split(",") ], + ignore_eos=args.ignore_eos, )) # Save config and results to json @@ -863,6 +866,11 @@ def main(args: argparse.Namespace): "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" " format.", ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the 
benchmark request."
+        " Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
     parser.add_argument(
         "--percentile-metrics",
         type=str,
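The `--ignore-eos` flag added above is threaded through `RequestFuncInput` and then expressed differently per backend: OpenAI-compatible servers receive `ignore_eos` in the request payload, TensorRT-LLM approximates it via `min_length`, and TGI has no equivalent. The condensed sketch below illustrates that routing; the backend names and the helper are illustrative, not the actual `backend_request_func.py` code.

```
from dataclasses import dataclass


@dataclass
class RequestFuncInput:
    prompt: str
    output_len: int
    ignore_eos: bool = False


def build_payload(backend: str, request: RequestFuncInput) -> dict:
    payload = {"max_tokens": request.output_len, "stream": True}
    if backend in ("vllm", "sglang", "openai"):
        # OpenAI-compatible servers accept ignore_eos directly.
        payload["ignore_eos"] = request.ignore_eos
    elif backend == "tensorrt-llm" and request.ignore_eos:
        # TRT-LLM has no ignore_eos flag; forcing a minimum output length
        # yields the same fixed-length generations.
        payload["min_length"] = request.output_len
    # TGI supports neither, which is why the diff leaves its payload untouched.
    return payload


print(build_payload("tensorrt-llm", RequestFuncInput("Hello", 256, ignore_eos=True)))
```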