Add readme generation to benchmarks (#1)
* Added readme script

* Added the docstrings

* Added gpu_name and tests

* Added min-number-of-valid-queries
gsolard authored Jul 3, 2024
1 parent 075a8a1 commit 5e5ac33
Showing 13 changed files with 348 additions and 21 deletions.
4 changes: 4 additions & 0 deletions .env.example
@@ -6,6 +6,8 @@ PORT=8000
DATASET_FOLDER="datasets"
OUTPUT_FOLDER="results"

GPU_NAME="A100-80g"

STEP_LIVE_METRICS=0.01
MAX_QUERIES=1000

@@ -15,6 +17,8 @@ MAX_DURATION_SPEED_GENERATION=6000
MIN_DURATION_SPEED_GENERATION=300
TARGET_QUERIES_NB_SPEED_GENERATION=60

MIN_NUMBER_OF_VALID_QUERIES=50

BACKEND="happy_vllm"
COMPLETIONS_ENDPOINT="/v1/completions"
METRICS_ENDPOINT="/metrics/"
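
The two new variables feed the matching `gpu_name` and `min_number_of_valid_queries` fields added to the pydantic settings classes further down in this diff. Below is a minimal sketch of how that pattern resolves values, assuming a stripped-down settings class (the real `BenchmarkSettings` has many more fields): values read from the environment or `.env` become argparse defaults, so CLI flags still override them.

```python
# Minimal sketch, assuming a stripped-down settings class; the real
# BenchmarkSettings in bench_suite.py has many more fields.
import argparse
from typing import Optional

from pydantic_settings import BaseSettings, SettingsConfigDict


class MiniSettings(BaseSettings):
    # Defaults mirror the values added in this commit
    gpu_name: Optional[str] = None
    min_number_of_valid_queries: int = 50

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")


parser = argparse.ArgumentParser()
parser.add_argument("--gpu-name", type=str)
parser.add_argument("--min-number-of-valid-queries", type=int)
# .env / environment values become argparse defaults; CLI flags keep the last word
parser.set_defaults(**MiniSettings().model_dump())
args = parser.parse_args()
print(args.gpu_name, args.min_number_of_valid_queries)
```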
4 changes: 3 additions & 1 deletion README.md
@@ -38,7 +38,7 @@ You can specify the launch arguments either via the CLI or a .env (see the `.env

After the bench suite ends, you obtain a folder containing :

- The results of all the benchmarks (in the folder `raw_results` )
- The results of all the benchmarks (in the zip file `raw_results.zip`)
- A folder `report` containing the aggregation of all the individual benchmarks. More specifically:
- `parameters.json` containing all the parameters for the bench, in particular, the arguments used to launch the `happy_vllm` API
- `prompt_ingestion_graph.png` containing the graph of the speed of prompt ingestion by the model. It is the time taken to produce the first token vs the length of the prompt. The speed is the slope of this line and is indicated in the title of the graph. The data used for this graph is contained in the `data` folder.
@@ -60,13 +60,15 @@ Here is a list of the arguments:
- `port` : The port of the API (if you specify a base-url, you don't need to specify a port)
- `dataset-folder` : The folder where the datasets for querying the API are (by default, it is in `datasets`)
- `output-folder` : The folder where the results will be written (by default in the `results` folder)
- `gpu-name` : The name of the GPU on which the model runs (default `None`)
- `step-live-metrics` : The time, in seconds, between two successive queries of the `/metrics/` endpoint of happy_vllm (default `0.01`)
- `max-queries` : The maximal number of queries for each bench (default `1000`)
- `max-duration-prompt-ingestion` : The max duration (in seconds) for the execution of an individual script benchmarking the prompt ingestion (default `900`)
- `max-duration-kv-cache-profile` : The max duration (in seconds) for the execution of an individual script benchmarking the KV cache usage (default `900`)
- `max-duration-speed-generation` : The max duration (in seconds) for the execution of an individual script benchmarking the speed generation (default `900`). It is also the max duration permitted for the launch of all the scripts benchmarking the speed generation for a given input length/output length pair.
- `min-duration-speed-generation` : For each individual script benchmarking the speed generation, if this min duration (in seconds) is reached and the target-queries-nb is also reached, the script will end (default `60`)
- `target-queries-nb-speed-generation` : For each individual script benchmarking the speed generation, if this target-queries-nb is reached and the min-duration is also reached, the script will end (default `100`)
- `min-number-of-valid-queries` : The minimal number of valid queries that must be present in a result file for it to be considered when drawing the graphs (default `50`)
- `backend` : For now, only happy_vllm is supported.
- `completions-endpoint` : The endpoint for completions (default `/v1/completions`)
- `metrics-endpoint` : The endpoint for the metrics (default `/metrics/`)
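
As the README change above notes, the raw benchmark results are now shipped as `raw_results.zip` instead of a plain `raw_results` folder. Here is a small sketch, assuming the default `results` output folder and the file names used by `bench_suite.py`, for inspecting one raw result file without unpacking the whole archive:

```python
# Sketch only: the paths and the chosen file name are examples, adjust to your run.
import json
import zipfile

with zipfile.ZipFile("results/raw_results.zip") as archive:
    # e.g. prompt_ingestion_0.json, kv_cache_profile_*.json, generation_speed_*.json
    print(archive.namelist())
    with archive.open("prompt_ingestion_0.json") as json_file:
        results = json.load(json_file)

print(results["general_metrics"]["total_number_of_queries"])
```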
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -18,7 +18,8 @@ dependencies = [
"matplotlib>=3.8.4,<4.0",
"pydantic>=2.7.1,<3.0",
"pydantic-settings>=2.2.1,<3.0",
"requests>=2.31.0,<3.0"
"requests>=2.32.0,<3.0",
"mdutils>=1.6.0,<2.0"
]
classifiers = [
"Programming Language :: Python :: 3",
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ prometheus_client==0.20.0
matplotlib==3.8.4
pydantic==2.7.1
pydantic-settings==2.2.1
requests==2.31.0
requests==2.32.0
mdutils==1.6.0
37 changes: 30 additions & 7 deletions src/benchmark_llm_serving/bench_suite.py
@@ -1,6 +1,7 @@
import os
import random
import string
import shutil
import logging
import argparse
from pathlib import Path
@@ -10,6 +11,7 @@

from benchmark_llm_serving import utils
from benchmark_llm_serving.io_classes import QueryInput
from benchmark_llm_serving.make_readmes import make_readme
from benchmark_llm_serving.make_graphs import draw_and_save_graphs
from benchmark_llm_serving.benchmark import launch_benchmark, augment_dataset
from benchmark_llm_serving.utils_args import get_parser_base_arguments, add_arguments_to_parser
@@ -64,6 +66,8 @@ class BenchmarkSettings(BaseSettings):
host: str = "localhost"
port: int = 8000

gpu_name: Optional[str] = None

dataset_folder: str = "datasets"
output_folder: str = "results"

@@ -77,6 +81,8 @@ class BenchmarkSettings(BaseSettings):
min_duration_speed_generation: int = 60
target_queries_nb_speed_generation: int = 100

min_number_of_valid_queries: int = 50

backend: str = "happy_vllm"
completions_endpoint: str = "/v1/completions"
metrics_endpoint: str = "/metrics/"
@@ -96,6 +102,7 @@ def main():
parser.add_argument("--min-duration-speed-generation", type=int, help="The min duration for the speed generation")
parser.add_argument("--target-queries-nb-speed_generation", type=int, help="The target_queries for the speed generation")
parser.add_argument("--speed-threshold", type=float, help="Accepted threshold for generation speed")
parser.add_argument("--min-number-of-valid-queries", type=int, help="The minimal number of queries needed to consider a file for drawing the graphs")
parser.set_defaults(**bench_settings.model_dump())

parser = add_arguments_to_parser(parser)
@@ -112,9 +119,9 @@ def main():
current_directory = Path(os.path.dirname(os.path.realpath(__file__)))
grand_parent_directory = current_directory.parent.parent.absolute()
output_folder = os.path.join(grand_parent_directory, args.output_folder)
raw_result_folder = os.path.join(output_folder, "raw_results")
if not os.path.isdir(raw_result_folder):
os.makedirs(raw_result_folder)
raw_results_folder = os.path.join(output_folder, "raw_results")
if not os.path.isdir(raw_results_folder):
os.makedirs(raw_results_folder)

input_lengths = ["32", "1024", "4096"]
output_lengths = [16, 128, 1024]
@@ -151,7 +158,7 @@ def main():
for i in range(4):
now = utils.get_now()
logger.info(f"{now} Benchmark for the prompt ingestion speed : instance {i} ")
args.output_file = os.path.join(raw_result_folder, f"prompt_ingestion_{i}.json")
args.output_file = os.path.join(raw_results_folder, f"prompt_ingestion_{i}.json")
dataset = add_prefixes_to_dataset(datasets[args.prompt_length], 4)
launch_benchmark(args, dataset, suite_id)
now = utils.get_now()
@@ -171,7 +178,7 @@ def main():
for input_length, output_length in input_output_lengths:
args.prompt_length = input_length
args.output_length = output_length
args.output_file = os.path.join(raw_result_folder, f"kv_cache_profile_input_{input_length}_output_{output_length}.json")
args.output_file = os.path.join(raw_results_folder, f"kv_cache_profile_input_{input_length}_output_{output_length}.json")
now = utils.get_now()
dataset = add_prefixes_to_dataset(datasets[args.prompt_length], 4)
logger.info(f"{now} Beginning the benchmark for the KV cache profile, input length : {input_length}, output_length : {output_length}")
@@ -200,7 +207,7 @@ def main():
args.prompt_length = input_length
args.output_length = output_length
args.n_workers = nb_constant_requests
args.output_file = os.path.join(raw_result_folder, f"generation_speed_input_{input_length}_output_{output_length}_nb_requests_{nb_constant_requests}.json")
args.output_file = os.path.join(raw_results_folder, f"generation_speed_input_{input_length}_output_{output_length}_nb_requests_{nb_constant_requests}.json")
now = utils.get_now()
logger.info(f"{now} Benchmarks for the generation speed, input length : {input_length}, output_length : {output_length}, nb_requests : {nb_constant_requests}")
dataset = add_prefixes_to_dataset(datasets[args.prompt_length], 4)
@@ -217,10 +224,26 @@ def main():

now = utils.get_now()
logger.info(f"{now} Drawing graphs")
draw_and_save_graphs(output_folder, speed_threshold=args.speed_threshold)
draw_and_save_graphs(output_folder, speed_threshold=args.speed_threshold, gpu_name=args.gpu_name,
min_number_of_valid_queries=args.min_number_of_valid_queries)
now = utils.get_now()
logger.info(f"{now} Drawing graphs : DONE")

now = utils.get_now()
logger.info(f"{now} Making readme")
make_readme(output_folder)
now = utils.get_now()
logger.info(f"{now} Making readme : DONE")

now = utils.get_now()
logger.info(f"{now} Zipping raw_results folder")
shutil.make_archive(os.path.join(output_folder, "raw_results"), 'zip', raw_results_folder)
shutil.rmtree(raw_results_folder)
now = utils.get_now()
logger.info(f"{now} Zipping raw_results folder : DONE")

now = utils.get_now()
logger.info(f"{now} Everything : DONE")

if __name__ == "__main__":
main()
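
The new `make_readme(output_folder)` step relies on the freshly added `mdutils` dependency, but `make_readmes.py` itself is not part of this excerpt. The following is a hypothetical sketch of what such mdutils-based generation can look like; the section titles and layout are assumptions, only the graph file names come from `make_graphs.py`.

```python
# Hypothetical sketch, not the actual make_readmes.py: section names and layout
# are assumed, the image file names come from make_graphs.py.
import os

from mdutils.mdutils import MdUtils


def make_readme_sketch(output_folder: str) -> None:
    report_folder = os.path.join(output_folder, "report")
    md = MdUtils(file_name=os.path.join(report_folder, "README"), title="Benchmark report")
    md.new_header(level=1, title="Prompt ingestion")
    md.new_line(md.new_inline_image(text="Prompt ingestion speed",
                                    path="prompt_ingestion_graph.png"))
    md.new_header(level=1, title="Total generation speed")
    md.new_line(md.new_inline_image(text="Total speed generation",
                                    path="total_speed_generation_graph.png"))
    md.create_md_file()
```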
30 changes: 19 additions & 11 deletions src/benchmark_llm_serving/make_graphs.py
@@ -4,8 +4,8 @@
import logging
import argparse
import numpy as np
from typing import Any
from pathlib import Path
from typing import Any, Union
from matplotlib import pyplot as plt
from pydantic_settings import BaseSettings, SettingsConfigDict

@@ -28,6 +28,7 @@ class GraphsSettings(BaseSettings):
"""
output_folder: str = "results"
speed_threshold: float = 20.0
min_number_of_valid_queries: int = 50

model_config = SettingsConfigDict(env_file=".env", extra='ignore', protected_namespaces=('settings', ))

@@ -79,7 +80,7 @@ def make_prompt_ingestion_graph(files: dict, report_folder: str) -> None:

plt.title(f"Model : {model_name} \n Prompt ingestion speed : {prompt_ingestion_coefficient} tokens per second", fontsize='16')
# Save plot
plt.savefig(os.path.join(report_folder, 'prompt_ingestion_graph.png'),bbox_inches='tight',dpi=200)
plt.savefig(os.path.join(report_folder, 'prompt_ingestion_graph.png'),bbox_inches='tight',dpi=75)

# Save data
graph_data = {"fit_coefficients": list(second_fit_coefficients),
@@ -189,7 +190,7 @@ def make_speed_generation_graph_for_one_input_output(input_length: int, output_l

plt.title(f"Model : {model_name} \n Speed generation | input length: {input_length} | output length : {output_length}", fontsize='16')
# Save graph
plt.savefig(os.path.join(report_folder, "speed_generation", f'speed_generation_graph_input_{input_length}_output_{output_length}.png'), bbox_inches='tight',dpi=200)
plt.savefig(os.path.join(report_folder, "speed_generation", f'speed_generation_graph_input_{input_length}_output_{output_length}.png'), bbox_inches='tight',dpi=75)

# Save data
with open(os.path.join(report_folder, "speed_generation", "data", f"speed_generation_graph_data_input_{input_length}_output_{output_length}.json"), 'w') as json_file:
@@ -288,7 +289,7 @@ def make_kv_cache_profile_graph_for_one_input_output(input_length: int, output_l

plt.title(f"Model : {model_name} \n KV cache profile | input length: {input_length} | output length : {output_length}", fontsize='16')
# Save graph
plt.savefig(os.path.join(report_folder, "kv_cache_profile", f'graph_kv_cache_profile_input_{input_length}_output_{output_length}.png'), bbox_inches='tight',dpi=200)
plt.savefig(os.path.join(report_folder, "kv_cache_profile", f'graph_kv_cache_profile_input_{input_length}_output_{output_length}.png'), bbox_inches='tight',dpi=75)


def make_kv_cache_profile_graphs(files: dict, report_folder: str) -> None:
@@ -368,7 +369,7 @@ def make_total_speed_generation_graph(files: dict, report_folder: str) -> None:

plt.legend()

plt.savefig(os.path.join(report_folder, f'total_speed_generation_graph.png'), bbox_inches='tight',dpi=200)
plt.savefig(os.path.join(report_folder, f'total_speed_generation_graph.png'), bbox_inches='tight',dpi=75)


# Save data
@@ -381,12 +382,13 @@ def make_total_speed_generation_graph(files: dict, report_folder: str) -> None:
json.dump(data_to_save, json_file, indent=4)


def save_common_parameters(files: dict, report_folder: str):
def save_common_parameters(files: dict, report_folder: str, gpu_name: str):
"""Saves the common parameters of all the benchmarks.
Args:
files (dict) : The files containing the results of the benchmarks
report_folder (str) : The folder where the report should be written
gpu_name (str) : The name of the GPU
"""
common_parameters: dict[str, Any] = {}
for results in files.values():
@@ -400,18 +402,21 @@ def save_common_parameters(files: dict, report_folder: str):
if key in common_parameters:
if value != common_parameters[key]:
common_parameters.pop(key)
common_parameters["gpu_name"] = gpu_name

with open(os.path.join(report_folder, 'parameters.json'), 'w') as json_file:
json.dump(common_parameters, json_file, indent=4)


def draw_and_save_graphs(output_folder: str, speed_threshold: float = 20.0):
def draw_and_save_graphs(output_folder: str, speed_threshold: float = 20.0, gpu_name: Union[str, None] = None,
min_number_of_valid_queries: int = 50):
"""Draws and saves all the graphs and corresponding data for benchmark results
obtained via bench_suite.py
Args:
output_folder (str) : The folder where the results of the benchmarks are
speed_threshold (float) : The accepted speed generation to fix the threshold
gpu_name (str) : The name of the gpu
"""
# Manage output path
if not os.path.isabs(output_folder):
@@ -440,7 +445,7 @@ def draw_and_save_graphs(output_folder: str, speed_threshold: float = 20.0):
with open(os.path.join(raw_result_folder, filename), 'r') as json_file:
files[filename] = json.load(json_file)
files = {key: value for key, value in files.items()
if value['general_metrics']['total_number_of_queries'] - value['general_metrics']['nb_errored_queries'] + value['general_metrics']['nb_timeout_queries'] > 50}
if value['general_metrics']['total_number_of_queries'] - value['general_metrics']['nb_errored_queries'] + value['general_metrics']['nb_timeout_queries'] >= min_number_of_valid_queries}

now = utils.get_now()
logger.info(f"{now} Making prompt ingestion graph")
@@ -454,16 +459,19 @@ def draw_and_save_graphs(output_folder: str, speed_threshold: float = 20.0):
now = utils.get_now()
logger.info(f"{now} Making total speed generation graph")
make_total_speed_generation_graph(files, report_folder)
save_common_parameters(files, report_folder)
save_common_parameters(files, report_folder, gpu_name)
now = utils.get_now()
logger.info(f"{now} Graphs done")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark script")
parser = argparse.ArgumentParser(description="Graphs script")
parser.add_argument("--output-folder", type=str, help="Path to the output folder")
parser.add_argument("--speed-threshold", type=float, default=20.0, help="Accepted threshold for generation speed")
parser.add_argument("--gpu-name", help="The name of the GPU")
parser.add_argument("--min-number-of-valid-queries", type=int, help="The minimal number of queries needed to consider a file for drawing the graphs")
graph_settings = GraphsSettings()
parser.set_defaults(**graph_settings.model_dump())
args = parser.parse_args()
draw_and_save_graphs(output_folder=args.output_folder, speed_threshold=args.speed_threshold)
draw_and_save_graphs(output_folder=args.output_folder, speed_threshold=args.speed_threshold, gpu_name=args.gpu_name,
min_number_of_valid_queries=args.min_number_of_valid_queries)
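
Besides being called from `bench_suite.py`, `draw_and_save_graphs` can also be invoked directly with the new parameters; result files with fewer valid queries than `min_number_of_valid_queries` are skipped when drawing the graphs. A usage sketch based on the signature above (the values themselves are only examples):

```python
# Usage sketch: the keyword arguments match the updated signature above,
# the values are only examples.
from benchmark_llm_serving.make_graphs import draw_and_save_graphs

draw_and_save_graphs(output_folder="results",
                     speed_threshold=20.0,
                     gpu_name="A100-80g",
                     min_number_of_valid_queries=50)
```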