Add readme generation to benchmarks (#1)
* Added readme script

* Added the docstrings

* Added gpu_name and tests

* Added min-number-of-valid-queries
gsolard authored Jul 3, 2024
1 parent 075a8a1 commit 5e5ac33
Showing 13 changed files with 348 additions and 21 deletions.
4 changes: 4 additions & 0 deletions .env.example
@@ -6,6 +6,8 @@ PORT=8000
DATASET_FOLDER="datasets"
OUTPUT_FOLDER="results"

GPU_NAME="A100-80g"

STEP_LIVE_METRICS=0.01
MAX_QUERIES=1000

@@ -15,6 +17,8 @@ MAX_DURATION_SPEED_GENERATION=6000
MIN_DURATION_SPEED_GENERATION=300
TARGET_QUERIES_NB_SPEED_GENERATION=60

MIN_NUMBER_OF_VALID_QUERIES=50

BACKEND="happy_vllm"
COMPLETIONS_ENDPOINT="/v1/completions"
METRICS_ENDPOINT="/metrics/"
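
The two new variables feed the matching `gpu_name` and `min_number_of_valid_queries` fields added to the pydantic settings classes further down in this diff. Below is a minimal sketch of how that pattern resolves values, assuming a stripped-down settings class (the real `BenchmarkSettings` has many more fields): values read from the environment or `.env` become argparse defaults, so CLI flags still override them.

```python
# Minimal sketch, assuming a stripped-down settings class; the real
# BenchmarkSettings in bench_suite.py has many more fields.
import argparse
from typing import Optional

from pydantic_settings import BaseSettings, SettingsConfigDict


class MiniSettings(BaseSettings):
    # Defaults mirror the values added in this commit
    gpu_name: Optional[str] = None
    min_number_of_valid_queries: int = 50

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")


parser = argparse.ArgumentParser()
parser.add_argument("--gpu-name", type=str)
parser.add_argument("--min-number-of-valid-queries", type=int)
# .env / environment values become argparse defaults; CLI flags keep the last word
parser.set_defaults(**MiniSettings().model_dump())
args = parser.parse_args()
print(args.gpu_name, args.min_number_of_valid_queries)
```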
4 changes: 3 additions & 1 deletion README.md
@@ -38,7 +38,7 @@ You can specify the launch arguments either via the CLI or a .env (see the `.env

After the bench suite ends, you obtain a folder containing :

- The results of all the benchmarks (in the folder `raw_results` )
- The results of all the benchmarks (in the zip file `raw_results.zip`)
- A folder `report` containing the aggregation of all the individual benchmarks. More specifically:
- `parameters.json` containing all the parameters for the bench, in particular, the arguments used to launch the `happy_vllm` API
- `prompt_ingestion_graph.png` containing the graph of the speed of prompt ingestion by the model. It is the time taken to produce the first token vs the length of the prompt. The speed is the slope of this line and is indicated in the title of the graph. The data used for this graph is contained in the `data` folder.
@@ -60,13 +60,15 @@ Here is a list of the arguments:
- `port` : The port of the API (if you specify a base-url, you don't need to specify a port)
- `dataset-folder` : The folder where the datasets for querying the API are (by default, it is in `datasets`)
- `output-folder` : The folder where the results will be written (by default in the `results` folder)
- `gpu-name` : The name of the GPU on which the model runs (default `None`)
- `step-live-metrics` : The time, in seconds, between two successive queries of the `/metrics/` endpoint of happy_vllm (default `0.01`)
- `max-queries` : The maximal number of queries for each bench (default `1000`)
- `max-duration-prompt-ingestion` : The max duration (in seconds) for the execution of an individual script benchmarking the prompt ingestion (default `900`)
- `max-duration-kv-cache-profile` : The max duration (in seconds) for the execution of an individual script benchmarking the KV cache usage (default `900`)
- `max-duration-speed-generation` : The max duration (in seconds) for the execution of an individual script benchmarking the speed generation (default `900`). It is also the max duration permitted for the launch of all the scripts benchmarking the speed generation for a given input length/output length pair.
- `min-duration-speed-generation` : For each individual script benchmarking the speed generation, if this min duration (in seconds) is reached and the target-queries-nb is also reached, the script will end (default `60`)
- `target-queries-nb-speed-generation` : For each individual script benchmarking the speed generation, if this target-queries-nb is reached and the min-duration is also reached, the script will end (default `100`)
- `min-number-of-valid-queries` : The minimal number of valid queries that must be present in a result file for it to be considered when drawing the graphs (default `50`)
- `backend` : For now, only happy_vllm is supported.
- `completions-endpoint` : The endpoint for completions (default `/v1/completions`)
- `metrics-endpoint` : The endpoint for the metrics (default `/metrics/`)
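
As the README change above notes, the raw benchmark results are now shipped as `raw_results.zip` instead of a plain `raw_results` folder. Here is a small sketch, assuming the default `results` output folder and the file names used by `bench_suite.py`, for inspecting one raw result file without unpacking the whole archive:

```python
# Sketch only: the paths and the chosen file name are examples, adjust to your run.
import json
import zipfile

with zipfile.ZipFile("results/raw_results.zip") as archive:
    # e.g. prompt_ingestion_0.json, kv_cache_profile_*.json, generation_speed_*.json
    print(archive.namelist())
    with archive.open("prompt_ingestion_0.json") as json_file:
        results = json.load(json_file)

print(results["general_metrics"]["total_number_of_queries"])
```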
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -18,7 +18,8 @@ dependencies = [
"matplotlib>=3.8.4,<4.0",
"pydantic>=2.7.1,<3.0",
"pydantic-settings>=2.2.1,<3.0",
"requests>=2.31.0,<3.0"
"requests>=2.32.0,<3.0",
"mdutils>=1.6.0,<2.0"
]
classifiers = [
"Programming Language :: Python :: 3",
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ prometheus_client==0.20.0
matplotlib==3.8.4
pydantic==2.7.1
pydantic-settings==2.2.1
requests==2.31.0
requests==2.32.0
mdutils==1.6.0
37 changes: 30 additions & 7 deletions src/benchmark_llm_serving/bench_suite.py
@@ -1,6 +1,7 @@
import os
import random
import string
import shutil
import logging
import argparse
from pathlib import Path
@@ -10,6 +11,7 @@

from benchmark_llm_serving import utils
from benchmark_llm_serving.io_classes import QueryInput
from benchmark_llm_serving.make_readmes import make_readme
from benchmark_llm_serving.make_graphs import draw_and_save_graphs
from benchmark_llm_serving.benchmark import launch_benchmark, augment_dataset
from benchmark_llm_serving.utils_args import get_parser_base_arguments, add_arguments_to_parser
@@ -64,6 +66,8 @@ class BenchmarkSettings(BaseSettings):
host: str = "localhost"
port: int = 8000

gpu_name: Optional[str] = None

dataset_folder: str = "datasets"
output_folder: str = "results"

@@ -77,6 +81,8 @@ class BenchmarkSettings(BaseSettings):
min_duration_speed_generation: int = 60
target_queries_nb_speed_generation: int = 100

min_number_of_valid_queries: int = 50

backend: str = "happy_vllm"
completions_endpoint: str = "/v1/completions"
metrics_endpoint: str = "/metrics/"
@@ -96,6 +102,7 @@ def main():
parser.add_argument("--min-duration-speed-generation", type=int, help="The min duration for the speed generation")
parser.add_argument("--target-queries-nb-speed_generation", type=int, help="The target_queries for the speed generation")
parser.add_argument("--speed-threshold", type=float, help="Accepted threshold for generation speed")
parser.add_argument("--min-number-of-valid-queries", type=int, help="The minimal number of queries needed to consider a file for drawing the graphs")
parser.set_defaults(**bench_settings.model_dump())

parser = add_arguments_to_parser(parser)
@@ -112,9 +119,9 @@ def main():
current_directory = Path(os.path.dirname(os.path.realpath(__file__)))
grand_parent_directory = current_directory.parent.parent.absolute()
output_folder = os.path.join(grand_parent_directory, args.output_folder)
raw_result_folder = os.path.join(output_folder, "raw_results")
if not os.path.isdir(raw_result_folder):
os.makedirs(raw_result_folder)
raw_results_folder = os.path.join(output_folder, "raw_results")
if not os.path.isdir(raw_results_folder):
os.makedirs(raw_results_folder)

input_lengths = ["32", "1024", "4096"]
output_lengths = [16, 128, 1024]
@@ -151,7 +158,7 @@ def main():
for i in range(4):
now = utils.get_now()
logger.info(f"{now} Benchmark for the prompt ingestion speed : instance {i} ")
args.output_file = os.path.join(raw_result_folder, f"prompt_ingestion_{i}.json")
args.output_file = os.path.join(raw_results_folder, f"prompt_ingestion_{i}.json")
dataset = add_prefixes_to_dataset(datasets[args.prompt_length], 4)
launch_benchmark(args, dataset, suite_id)
now = utils.get_now()
@@ -171,7 +178,7 @@ def main():
for input_length, output_length in input_output_lengths:
args.prompt_length = input_length
args.output_length = output_length
args.output_file = os.path.join(raw_result_folder, f"kv_cache_profile_input_{input_length}_output_{output_length}.json")
args.output_file = os.path.join(raw_results_folder, f"kv_cache_profile_input_{input_length}_output_{output_length}.json")
now = utils.get_now()
dataset = add_prefixes_to_dataset(datasets[args.prompt_length], 4)
logger.info(f"{now} Beginning the benchmark for the KV cache profile, input length : {input_length}, output_length : {output_length}")
@@ -200,7 +207,7 @@ def main():
args.prompt_length = input_length
args.output_length = output_length
args.n_workers = nb_constant_requests
args.output_file = os.path.join(raw_result_folder, f"generation_speed_input_{input_length}_output_{output_length}_nb_requests_{nb_constant_requests}.json")
args.output_file = os.path.join(raw_results_folder, f"generation_speed_input_{input_length}_output_{output_length}_nb_requests_{nb_constant_requests}.json")
now = utils.get_now()
logger.info(f"{now} Benchmarks for the generation speed, input length : {input_length}, output_length : {output_length}, nb_requests : {nb_constant_requests}")
dataset = add_prefixes_to_dataset(datasets[args.prompt_length], 4)
@@ -217,10 +224,26 @@ def main():

now = utils.get_now()
logger.info(f"{now} Drawing graphs")
draw_and_save_graphs(output_folder, speed_threshold=args.speed_threshold)
draw_and_save_graphs(output_folder, speed_threshold=args.speed_threshold, gpu_name=args.gpu_name,
min_number_of_valid_queries=args.min_number_of_valid_queries)
now = utils.get_now()
logger.info(f"{now} Drawing graphs : DONE")

now = utils.get_now()
logger.info(f"{now} Making readme")
make_readme(output_folder)
now = utils.get_now()
logger.info(f"{now} Making readme : DONE")

now = utils.get_now()
logger.info(f"{now} Zipping raw_results folder")
shutil.make_archive(os.path.join(output_folder, "raw_results"), 'zip', raw_results_folder)
shutil.rmtree(raw_results_folder)
now = utils.get_now()
logger.info(f"{now} Zipping raw_results folder : DONE")

now = utils.get_now()
logger.info(f"{now} Everything : DONE")

if __name__ == "__main__":
main()
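
The new `make_readme(output_folder)` step relies on the freshly added `mdutils` dependency, but `make_readmes.py` itself is not part of this excerpt. The following is a hypothetical sketch of what such mdutils-based generation can look like; the section titles and layout are assumptions, only the graph file names come from `make_graphs.py`.

```python
# Hypothetical sketch, not the actual make_readmes.py: section names and layout
# are assumed, the image file names come from make_graphs.py.
import os

from mdutils.mdutils import MdUtils


def make_readme_sketch(output_folder: str) -> None:
    report_folder = os.path.join(output_folder, "report")
    md = MdUtils(file_name=os.path.join(report_folder, "README"), title="Benchmark report")
    md.new_header(level=1, title="Prompt ingestion")
    md.new_line(md.new_inline_image(text="Prompt ingestion speed",
                                    path="prompt_ingestion_graph.png"))
    md.new_header(level=1, title="Total generation speed")
    md.new_line(md.new_inline_image(text="Total speed generation",
                                    path="total_speed_generation_graph.png"))
    md.create_md_file()
```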
30 changes: 19 additions & 11 deletions src/benchmark_llm_serving/make_graphs.py
@@ -4,8 +4,8 @@
import logging
import argparse
import numpy as np
from typing import Any
from pathlib import Path
from typing import Any, Union
from matplotlib import pyplot as plt
from pydantic_settings import BaseSettings, SettingsConfigDict

@@ -28,6 +28,7 @@ class GraphsSettings(BaseSettings):
"""
output_folder: str = "results"
speed_threshold: float = 20.0
min_number_of_valid_queries: int = 50

model_config = SettingsConfigDict(env_file=".env", extra='ignore', protected_namespaces=('settings', ))

@@ -79,7 +80,7 @@ def make_prompt_ingestion_graph(files: dict, report_folder: str) -> None:

plt.title(f"Model : {model_name} \n Prompt ingestion speed : {prompt_ingestion_coefficient} tokens per second", fontsize='16')
# Save plot
plt.savefig(os.path.join(report_folder, 'prompt_ingestion_graph.png'),bbox_inches='tight',dpi=200)
plt.savefig(os.path.join(report_folder, 'prompt_ingestion_graph.png'),bbox_inches='tight',dpi=75)

# Save data
graph_data = {"fit_coefficients": list(second_fit_coefficients),
@@ -189,7 +190,7 @@ def make_speed_generation_graph_for_one_input_output(input_length: int, output_l

plt.title(f"Model : {model_name} \n Speed generation | input length: {input_length} | output length : {output_length}", fontsize='16')
# Save graph
plt.savefig(os.path.join(report_folder, "speed_generation", f'speed_generation_graph_input_{input_length}_output_{output_length}.png'), bbox_inches='tight',dpi=200)
plt.savefig(os.path.join(report_folder, "speed_generation", f'speed_generation_graph_input_{input_length}_output_{output_length}.png'), bbox_inches='tight',dpi=75)

# Save data
with open(os.path.join(report_folder, "speed_generation", "data", f"speed_generation_graph_data_input_{input_length}_output_{output_length}.json"), 'w') as json_file:
@@ -288,7 +289,7 @@ def make_kv_cache_profile_graph_for_one_input_output(input_length: int, output_l

plt.title(f"Model : {model_name} \n KV cache profile | input length: {input_length} | output length : {output_length}", fontsize='16')
# Save graph
plt.savefig(os.path.join(report_folder, "kv_cache_profile", f'graph_kv_cache_profile_input_{input_length}_output_{output_length}.png'), bbox_inches='tight',dpi=200)
plt.savefig(os.path.join(report_folder, "kv_cache_profile", f'graph_kv_cache_profile_input_{input_length}_output_{output_length}.png'), bbox_inches='tight',dpi=75)


def make_kv_cache_profile_graphs(files: dict, report_folder: str) -> None:
@@ -368,7 +369,7 @@ def make_total_speed_generation_graph(files: dict, report_folder: str) -> None:

plt.legend()

plt.savefig(os.path.join(report_folder, f'total_speed_generation_graph.png'), bbox_inches='tight',dpi=200)
plt.savefig(os.path.join(report_folder, f'total_speed_generation_graph.png'), bbox_inches='tight',dpi=75)


# Save data
@@ -381,12 +382,13 @@ def make_total_speed_generation_graph(files: dict, report_folder: str) -> None:
json.dump(data_to_save, json_file, indent=4)


def save_common_parameters(files: dict, report_folder: str):
def save_common_parameters(files: dict, report_folder: str, gpu_name: str):
"""Saves the common parameters of all the benchmarks.
Args:
files (dict) : The files containing the results of the benchmarks
report_folder (str) : The folder where the report should be written
gpu_name (str) : The name of the GPU
"""
common_parameters: dict[str, Any] = {}
for results in files.values():
@@ -400,18 +402,21 @@ def save_common_parameters(files: dict, report_folder: str):
if key in common_parameters:
if value != common_parameters[key]:
common_parameters.pop(key)
common_parameters["gpu_name"] = gpu_name

with open(os.path.join(report_folder, 'parameters.json'), 'w') as json_file:
json.dump(common_parameters, json_file, indent=4)


def draw_and_save_graphs(output_folder: str, speed_threshold: float = 20.0):
def draw_and_save_graphs(output_folder: str, speed_threshold: float = 20.0, gpu_name: Union[str, None] = None,
min_number_of_valid_queries: int = 50):
"""Draws and saves all the graphs and corresponding data for benchmark results
obtained via bench_suite.py
Args:
output_folder (str) : The folder where the results of the benchmarks are
speed_threshold (float) : The accepted speed generation to fix the threshold
gpu_name (str) : The name of the gpu
"""
# Manage output path
if not os.path.isabs(output_folder):
@@ -440,7 +445,7 @@ def draw_and_save_graphs(output_folder: str, speed_threshold: float = 20.0):
with open(os.path.join(raw_result_folder, filename), 'r') as json_file:
files[filename] = json.load(json_file)
files = {key: value for key, value in files.items()
if value['general_metrics']['total_number_of_queries'] - value['general_metrics']['nb_errored_queries'] + value['general_metrics']['nb_timeout_queries'] > 50}
if value['general_metrics']['total_number_of_queries'] - value['general_metrics']['nb_errored_queries'] + value['general_metrics']['nb_timeout_queries'] >= min_number_of_valid_queries}

now = utils.get_now()
logger.info(f"{now} Making prompt ingestion graph")
@@ -454,16 +459,19 @@ def draw_and_save_graphs(output_folder: str, speed_threshold: float = 20.0):
now = utils.get_now()
logger.info(f"{now} Making total speed generation graph")
make_total_speed_generation_graph(files, report_folder)
save_common_parameters(files, report_folder)
save_common_parameters(files, report_folder, gpu_name)
now = utils.get_now()
logger.info(f"{now} Graphs done")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark script")
parser = argparse.ArgumentParser(description="Graphs script")
parser.add_argument("--output-folder", type=str, help="Path to the output folder")
parser.add_argument("--speed-threshold", type=float, default=20.0, help="Accepted threshold for generation speed")
parser.add_argument("--gpu-name", help="The name of the GPU")
parser.add_argument("--min-number-of-valid-queries", type=int, help="The minimal number of queries needed to consider a file for drawing the graphs")
graph_settings = GraphsSettings()
parser.set_defaults(**graph_settings.model_dump())
args = parser.parse_args()
draw_and_save_graphs(output_folder=args.output_folder, speed_threshold=args.speed_threshold)
draw_and_save_graphs(output_folder=args.output_folder, speed_threshold=args.speed_threshold, gpu_name=args.gpu_name,
min_number_of_valid_queries=args.min_number_of_valid_queries)
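
Besides being called from `bench_suite.py`, `draw_and_save_graphs` can also be invoked directly with the new parameters; result files with fewer valid queries than `min_number_of_valid_queries` are skipped when drawing the graphs. A usage sketch based on the signature above (the values themselves are only examples):

```python
# Usage sketch: the keyword arguments match the updated signature above,
# the values are only examples.
from benchmark_llm_serving.make_graphs import draw_and_save_graphs

draw_and_save_graphs(output_folder="results",
                     speed_threshold=20.0,
                     gpu_name="A100-80g",
                     min_number_of_valid_queries=50)
```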