add torch inference benchmark (#940)
* add torch inference benchmark

* add torch benchmark test

* code review

* code review

* update model path

* code clean up

* update torch value

* code clean

* code clean

* code opt

* code opt

* code refactor

* code opt

* add other test

* update model

* add d_type

* fix torch dtype

* rm dtype

* cleanup

* cleanup

* code opt

* code clean up

---------

Co-authored-by: LRL-ModelCloud <lrl@lbx.dev>
CL-ModelCloud and LRL-ModelCloud authored Dec 20, 2024
1 parent 2926435 commit 2707ac7
Showing 4 changed files with 92 additions and 4 deletions.
5 changes: 2 additions & 3 deletions gptqmodel/models/loader.py
@@ -222,10 +222,9 @@ def from_quantized(
             **cached_file_kwargs,
         )

-        if torch_dtype is None or torch_dtype == "auto" or device == DEVICE.XPU:
+        if torch_dtype is None or torch_dtype == "auto":
             # TODO FIX ME for `dynamic`, non-quantized modules should be in native type
-            torch_dtype = torch.float16
-            # auto_dtype_from_config(config=config, device=device, device_map=device_map)
+            torch_dtype = auto_dtype_from_config(config=config, device=device, device_map=device_map)
         elif not isinstance(torch_dtype, torch.dtype):
             raise ValueError(f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance.")

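In effect, when `torch_dtype` is left unset (or `"auto"`), `from_quantized` now derives the dtype from the model config via `auto_dtype_from_config` instead of hard-coding `torch.float16`. A minimal sketch of the call site, with a hypothetical model path:

```python
# Minimal sketch; the model path is hypothetical.
from gptqmodel import GPTQModel

# torch_dtype is omitted (None/"auto"), so the loader now resolves it from the
# model config via auto_dtype_from_config() instead of forcing torch.float16.
model = GPTQModel.from_quantized("/path/to/quantized-model", device="cuda")
```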
5 changes: 4 additions & 1 deletion gptqmodel/utils/model.py
@@ -685,7 +685,10 @@ def get_checkpoints(model_id_or_path: str, extensions: List[str], possible_model
 # return the most stable tensor dtype for quantization while minimizing vram
 def auto_dtype_from_config(config: PretrainedConfig, device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, device: Optional[Union[str, int]] = None ) -> torch.dtype:
     # TODO mps has errors with bfloat16, lock to float16 for now
-    if sys.platform == "darwin" or "mps" in [device, device_map] or (isinstance(device_map, Dict) and "mps" in device_map.values()):
+    if sys.platform == "darwin" or "mps" in [device, device_map] or (
+            isinstance(device_map, Dict) and "mps" in device_map.values()):
         return torch.float16
+    elif device == DEVICE.XPU:
+        return torch.float16

     dtype = getattr(config, "torch_dtype")
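A hedged sketch of the resolution logic above (the HF model id is only an example; the import path follows the file shown):

```python
# Hedged sketch; the HF model id is an example, any config works.
import torch
from transformers import AutoConfig
from gptqmodel.utils.model import auto_dtype_from_config

config = AutoConfig.from_pretrained("facebook/opt-125m")
# "mps" (and, per this commit, XPU) are locked to float16:
assert auto_dtype_from_config(config=config, device="mps") == torch.float16
```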
70 changes: 70 additions & 0 deletions tests/benchmark/benchmark_test.py
@@ -0,0 +1,70 @@
import os
import time

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

import unittest  # noqa: E402

from transformers import AutoTokenizer  # noqa: E402

from gptqmodel import GPTQModel  # noqa: E402
from gptqmodel.utils.progress import ProgressBar  # noqa: E402


class BenchmarkTest(unittest.TestCase):
    MODEL_ID = "/monster/data/model/gptq_4bits_11-21_15-47-09_maxlen2048_ns2048_descFalse_damp0.1"
    MIN_NEW_TOKENS = 100
    NUM_RUNS = 10
    PROMPTS = [
        "I am in Paris and I",
        "The capital of the United Kingdom is",
        "The largest ocean on Earth is",
        "The world’s longest river is",
        "The tallest mountain in the world is",
        "The currency used in Japan is",
        "How to consult a dictionary?",
        "What is the boiling point of water in degrees Celsius?",
        "Which is the most widely used Internet search engine in the world?",
        "What is the official language of France?",
    ]
    MAX_DELTA_FLOOR_PERCENT = 0.15
    MAX_POSITIVE_DELTA_CEIL_PERCENT = 1.0

    def benchmark(self, backend, device, tokens_per_second):
        model = GPTQModel.from_quantized(
            self.MODEL_ID,
            device=device,
            backend=backend,
        )

        tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
        tokenizer.pad_token = tokenizer.eos_token
        inp = tokenizer(self.PROMPTS, padding=True, truncation=True, return_tensors="pt", padding_side='left').to(device)

        times = []
        pb = ProgressBar(range(self.NUM_RUNS))
        for i in pb:
            pb.set_description(f"run index {i} of {self.NUM_RUNS - 1}")
            start_time = time.time()
            model.generate(**inp, num_beams=1, min_new_tokens=self.MIN_NEW_TOKENS,
                           max_new_tokens=self.MIN_NEW_TOKENS)
            end_time = time.time()

            elapsed_time = end_time - start_time
            times.append(elapsed_time)

        sum_time = sum(times)
        sum_tokens = len(self.PROMPTS) * self.MIN_NEW_TOKENS * self.NUM_RUNS
        avg_tokens_per_second = sum_tokens / sum_time

        print("**************** Benchmark Result Info ****************")
        print(f"Times: {times}")
        print(f"Sum Times: {sum_time}")
        print(f"Sum New Tokens: {sum_tokens}")
        print(f"Benchmark Result: {avg_tokens_per_second} token/s")
        print("**************** Benchmark Result Info End ****************")

        diff_pct = (avg_tokens_per_second / tokens_per_second) * 100
        negative_pct = 100 * (1 - self.MAX_DELTA_FLOOR_PERCENT)
        positive_pct = 100 * (1 + self.MAX_POSITIVE_DELTA_CEIL_PERCENT)

        self.assertTrue(negative_pct <= diff_pct <= positive_pct,
                        f"Tokens Per Second: {avg_tokens_per_second} diff {diff_pct:.2f}% is out of the expected range [{negative_pct}%, {positive_pct}%]")
16 changes: 16 additions & 0 deletions tests/benchmark/test_inference.py
@@ -0,0 +1,16 @@
from benchmark_test import BenchmarkTest
from gptqmodel import BACKEND
from parameterized import parameterized  # noqa: E402


class TestInference(BenchmarkTest):
    @parameterized.expand(
        [
            (BACKEND.TORCH, 'cuda', 49.20),
            # (BACKEND.TORCH, 'cpu', 49.20),
            # (BACKEND.TORCH, 'xpu', 49.20),
            # (BACKEND.TORCH, 'mps', 49.20),
        ]
    )
    def test_inference(self, backend, device, tokens_per_second):
        self.benchmark(backend=backend, device=device, tokens_per_second=tokens_per_second)
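The parameterized rows expand into one test per (backend, device, baseline) tuple; the commented-out cpu/xpu/mps rows can be enabled once baselines exist for those devices. An assumed invocation sketch using the standard library runner:

```python
# Assumed invocation sketch; equivalent to
# `python -m unittest discover tests/benchmark -p test_inference.py`.
import unittest

suite = unittest.defaultTestLoader.discover("tests/benchmark", pattern="test_inference.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```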
