This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Add NM benchmarking scripts & utils #14

Merged 75 commits on Feb 22, 2024
Commits (75)
1802429
Create nm-benchmarks - wip
Feb 13, 2024
a62a1c1
move nm benchmark scripts to neural magic folder
Feb 14, 2024
76c0064
move scripts to scripts folder
Feb 14, 2024
77f7d03
add test config
Feb 14, 2024
e43ad27
add call_cmd from wand
Feb 14, 2024
c8cac08
import time.sh from wand
Feb 15, 2024
2cfc013
add time.sh from wand
Feb 15, 2024
171421d
Add benchmark runner scripts
Feb 15, 2024
88e80bc
ruff formatting
Feb 15, 2024
8cb4473
rename test_config -> benchmark_serving
Feb 15, 2024
a213440
add empty throughput config json
Feb 15, 2024
a745195
Add separate runners for benchmark throughput and serving
Feb 15, 2024
75d933e
add common for common bench functions
Feb 15, 2024
d0884b4
Add benchmark throughput script and runner
Feb 15, 2024
082a3bd
Add empty common.py
Feb 15, 2024
57c8e7b
log benchmark environment
Feb 15, 2024
97e5a69
add 1/2 hour timeout
Feb 16, 2024
f41d0ba
add cuda device properties to bench env
Feb 16, 2024
3f6cf0d
wip
Feb 16, 2024
b49a7ac
some cleanups
Feb 16, 2024
f84384b
yapf
Feb 16, 2024
1cc82ad
add models of interest
Feb 16, 2024
603d456
update benchmark serving json
Feb 16, 2024
88c9e9f
Add benchmark throughput
Feb 18, 2024
44d3f72
Add synthetic dataset serving bench
Feb 18, 2024
acc1d04
add generate synthetic dataset (use sharegpt dataset)
Feb 18, 2024
20dd9f0
Move sample_requests and generate_synthetic_dataset to commons
Feb 19, 2024
9e0223b
Add prefill decode benchmarking script
Feb 19, 2024
51419b3
add prefill/decode benchmark json
Feb 19, 2024
f1bf28d
add empty benchmark runner
Feb 19, 2024
033e7ce
add prefill_decode_throughput runner script
Feb 19, 2024
7327be1
fix output dump
Feb 19, 2024
cf18c08
add prefill/decode benchmark to run_benchmarks
Feb 19, 2024
556e037
fix yapf
Feb 19, 2024
68c39a9
fix vscode warning
Feb 19, 2024
b6b6938
add dummy benchmark sparse serving
Feb 19, 2024
b79f0c0
add sparsity to benchmark serving
Feb 19, 2024
974eb15
Add sparsity to throughput scripts
Feb 19, 2024
8abba95
add prefill and decode case to throughput
Feb 19, 2024
c66c900
cleanup
Feb 19, 2024
f733fe9
fix json
Feb 19, 2024
4da6575
remove separate prefill/decode benchmarks
Feb 19, 2024
c810bcd
yapf
Feb 19, 2024
4f24272
fix throughput sparse json
Feb 19, 2024
6aa8fd1
format json files
Feb 19, 2024
cdff9e9
added example of dataset registry
Feb 19, 2024
59d1f14
fixed sharegpt
Feb 19, 2024
596081f
fixed sharegpt
Feb 19, 2024
fd25bcb
fix commons
Feb 20, 2024
bf833e5
cleanup
Feb 20, 2024
bb40116
add download datasets command
Feb 20, 2024
1ff3dc0
fix configs to remove dataset downloads
Feb 20, 2024
fa71ecc
cleanup
Feb 20, 2024
919bb2c
add num-prompts / request rate pair arg
Feb 20, 2024
ca7126d
update benchmark jsons
Feb 20, 2024
3acbebb
cleanup
Feb 20, 2024
0554d69
fix benchmark throughput
Feb 20, 2024
8c583a6
fixes
Feb 20, 2024
a9b732b
download model beforehand
Feb 20, 2024
7518d93
yapf
Feb 20, 2024
b32601a
add server warmup command
Feb 20, 2024
17c258b
add vllm engine warmup
Feb 20, 2024
2491bd3
yapf
Feb 20, 2024
bb7752c
fix serving bench
Feb 21, 2024
a86a0bf
fix benchmark throughput
Feb 21, 2024
fa634dd
yapf
Feb 21, 2024
fb98d2f
update readme
Feb 22, 2024
80c36f7
update note
Feb 22, 2024
c953aab
update time.sh
Feb 22, 2024
8fde41a
remove sparse version of the configs - to add in future
Feb 22, 2024
02cb4ab
fix dataset registry
Feb 22, 2024
75e6c7c
update readme
Feb 22, 2024
2aad328
appease ruff
Feb 22, 2024
1f59d64
fix strip in backend_request_func
Feb 22, 2024
dc64948
yapf
Feb 22, 2024
Empty file added neuralmagic/__init__.py
64 changes: 64 additions & 0 deletions neuralmagic/benchmarks/README.md
@@ -0,0 +1,64 @@
# Directory Structure:

- scripts/*.py - Benchmark scripts that perform the metric computation.

- configs/*.json - Config JSON files. These JSONs define what benchmark script to run and what combination of script parameters to use.

- *.py - Benchmark drivers. Given a config JSON, a driver executes all the commands that the config defines.

# Run Benchmark scripts

All `scripts/benchmark_*.py` files can be executed on their own.

Run `python -m neuralmagic.benchmarks.scripts.<script-name> --help` for a description of each script and how to run it.

# Benchmarking drivers and Configs

All the benchmark driver *.py files take a JSON config file and an output directory path as input.

As mentioned above, the config file defines which benchmark script to run and which arguments to run it with.

The following is an example config JSON:

```
{
    "description": "Benchmark vllm engine throughput - with dataset",
    "models": [
        "facebook/opt-125m",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    ],
    "sparsity": [],
    "script_name": "benchmark_throughput",
    "script_args": {
        "dataset": [
            "sharegpt",
            "ultrachat"
        ],
        "output-len": [
            128
        ],
        "num-prompts": [
            1000
        ]
    }
}
```
This config tells the benchmark driver to run the `benchmark_throughput` script on each listed model with every combination of the script args. That is, the config essentially translates to:

```
python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model facebook/opt-125m --dataset sharegpt --output-len 128 --num-prompts 1000
python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model facebook/opt-125m --dataset ultrachat --output-len 128 --num-prompts 1000
python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --dataset sharegpt --output-len 128 --num-prompts 1000
python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --dataset ultrachat --output-len 128 --num-prompts 1000
```
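
Internally this expansion is just a cross product over the `script_args` lists. The following is a minimal sketch of that idea (illustrative only, not the driver's actual code):

```
import itertools

# script_args taken from the example config above (illustrative values).
script_args = {
    "dataset": ["sharegpt", "ultrachat"],
    "output-len": [128],
    "num-prompts": [1000],
}

# Expand every combination of argument values into one command line.
for values in itertools.product(*script_args.values()):
    flags = []
    for name, value in zip(script_args.keys(), values):
        flags.extend([f"--{name}", str(value)])
    print("python -m neuralmagic.benchmarks.scripts.benchmark_throughput "
          "--model facebook/opt-125m " + " ".join(flags))
```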

# Benchmarking with driver
```
python3 -m neuralmagic.benchmarks.run_benchmarks -i <path-to-config-file> -o <output-directory-path>
```

# About sparsity
The benchmark configs have a `sparsity` field. Populate this field with a valid sparsity identifier to tell vllm about the model's sparsity; see the illustrative example below.
For the list of valid sparsity args, check `vllm/model_executor/layers/sparsity/*`.
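
A sparse config entry might look like the following minimal sketch (the model name and the `sparse_w16a16` identifier are illustrative placeholders; use the identifiers that actually exist under the directory above):

```
{
    "description": "Benchmark vllm engine throughput - sparse model",
    "models": ["neuralmagic/example-sparse-model"],
    "sparsity": ["sparse_w16a16"],
    "script_name": "benchmark_throughput",
    "script_args": {
        "dataset": ["sharegpt"],
        "output-len": [128],
        "num-prompts": [1000]
    }
}
```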
4 changes: 4 additions & 0 deletions neuralmagic/benchmarks/__init__.py
@@ -0,0 +1,4 @@
from neuralmagic.benchmarks.run_benchmark_serving import run_benchmark_serving_script
from neuralmagic.benchmarks.run_benchmark_throughput import run_benchmark_throughput_script

__all__ = ["run_benchmark_serving_script", "run_benchmark_throughput_script"]
62 changes: 62 additions & 0 deletions neuralmagic/benchmarks/common.py
@@ -0,0 +1,62 @@
import itertools
import json

from argparse import Namespace
from pathlib import Path
from typing import NamedTuple, Iterable
# from neuralmagic.tools.call_cmd import call_cmd

from vllm.model_executor.weight_utils import prepare_hf_model_weights
from vllm.transformers_utils.tokenizer import get_tokenizer


def download_model(hf_model_id: str) -> None:
    """
    Downloads a Hugging Face model to the local cache.
    """
    prepare_hf_model_weights(hf_model_id)
    get_tokenizer(hf_model_id)


def script_args_to_cla(config: NamedTuple) -> Iterable[list[str]]:
    # config is a NamedTuple constructed from some JSON in neuralmagic/benchmarks/configs
    kv = vars(config.script_args)

    keys = kv.keys()
    arg_lists = kv.values()
    assert all(map(lambda le: isinstance(le, list), arg_lists))

    # Empty lists are arguments without any values (e.g. boolean args)
    key_args = []
    for k, v in zip(keys, arg_lists):
        if len(v) == 0:
            key_args.append(k)

    key_args_cla = list(map(lambda k: f"--{k}", key_args))

    # Remove empty lists from arg_lists and remove key args from keys.
    # Materialize as lists so they can be iterated once per combination below.
    arg_lists = list(filter(lambda arg_list: len(arg_list) != 0, arg_lists))
    keys = list(filter(lambda k: k not in key_args, keys))

    for args in itertools.product(*arg_lists):
        # Copy the boolean-style args so each combination starts fresh.
        cla = list(key_args_cla)
        for name, value in zip(keys, args):
            cla.extend([f"--{name}", f"{value}"])
        yield cla


def benchmark_configs(config_file_path: Path) -> Iterable[NamedTuple]:
    """
    Given a path to a config file in `neuralmagic/benchmarks/configs/*`, return an
    Iterable of (sub)configs in the file.
    """
    assert config_file_path.exists()

    configs = None
    with open(config_file_path, "r") as f:
        configs = json.load(f, object_hook=lambda d: Namespace(**d))
    assert configs is not None

    for config in configs.configs:
        yield config
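
A minimal sketch of how a driver might compose these helpers (illustrative only; assumes a config file under `neuralmagic/benchmarks/configs/` and prints the commands instead of running them):

```
from pathlib import Path

from neuralmagic.benchmarks.common import (benchmark_configs, download_model,
                                           script_args_to_cla)

config_path = Path("neuralmagic/benchmarks/configs/benchmark_throughput.json")
for config in benchmark_configs(config_path):
    for model in config.models:
        download_model(model)  # cache the weights and tokenizer up front
        for script_args in script_args_to_cla(config):
            print(config.script_name, "--model", model, *script_args)
```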
56 changes: 56 additions & 0 deletions neuralmagic/benchmarks/configs/benchmark_serving.json
@@ -0,0 +1,56 @@
{
    "configs": [
        {
            "description": "Benchmark vllm serving",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "sparsity": [],
            "script_name": "benchmark_serving",
            "script_args": {
                "nr-qps-pair_": [
                    "50,0.5",
                    "100,1",
                    "200,2",
                    "500,5"
                ],
                "best-of": [
                    1
                ],
                "dataset": [
                    "sharegpt"
                ]
            }
        },
        {
            "description": "Benchmark vllm serving",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "sparsity": [],
            "script_name": "benchmark_serving",
            "script_args": {
                "num-prompts_": [
                    50,
                    100
                ],
                "request-rate_": [
                    0.5,
                    "inf"
                ],
                "best-of": [
                    1
                ],
                "dataset": [
                    "sharegpt"
                ]
            }
        }
    ]
}
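
Note on the trailing-underscore args: per the "add num-prompts / request rate pair arg" commit above, each `nr-qps-pair_` value appears to bundle a prompt count with a request rate as `"<num-prompts>,<request-rate>"`. A hypothetical helper (not part of this PR) that splits such a pair might look like:

```
def split_nr_qps_pair(pair: str) -> tuple[int, float]:
    # Assumed "<num-prompts>,<request-rate>" format, e.g. "50,0.5".
    num_prompts, request_rate = pair.split(",")
    return int(num_prompts), float(request_rate)

assert split_nr_qps_pair("50,0.5") == (50, 0.5)
```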
124 changes: 124 additions & 0 deletions neuralmagic/benchmarks/configs/benchmark_throughput.json
@@ -0,0 +1,124 @@
{
    "configs": [
        {
            "description": "Benchmark vllm engine throughput - with dataset",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "script_name": "benchmark_throughput",
            "script_args": {
                "backend": [
                    "vllm"
                ],
                "dataset": [
                    "sharegpt"
                ],
                "output-len": [
                    128
                ],
                "tensor-parallel-size": [
                    1
                ],
                "n": [
                    1
                ],
                "num-prompts": [
                    1000
                ],
                "seed": [
                    0
                ],
                "dtype": [
                    "auto"
                ]
            }
        },
        {
            "description": "Benchmark vllm engine prefill throughput - synthetic",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "script_name": "benchmark_throughput",
            "script_args": {
                "backend": [
                    "vllm"
                ],
                "input-len": [
                    1,
                    16,
                    32,
                    64,
                    128,
                    256,
                    512,
                    1024
                ],
                "output-len": [
                    1
                ],
                "tensor-parallel-size": [
                    1
                ],
                "n": [
                    1
                ],
                "num-prompts": [
                    1
                ],
                "seed": [
                    0
                ],
                "dtype": [
                    "auto"
                ]
            }
        },
        {
            "description": "Benchmark vllm engine decode throughput - synthetic",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "script_name": "benchmark_throughput",
            "script_args": {
                "backend": [
                    "vllm"
                ],
                "input-len": [
                    2
                ],
                "output-len": [
                    128
                ],
                "tensor-parallel-size": [
                    1
                ],
                "n": [
                    1
                ],
                "num-prompts": [
                    1,
                    4,
                    8,
                    16,
                    32,
                    64
                ],
                "seed": [
                    0
                ],
                "dtype": [
                    "auto"
                ]
            }
        }
    ]
}