Skip to content

Commit

Permalink
code review
Browse files Browse the repository at this point in the history
  • Loading branch information
CL-ModelCloud committed Jan 24, 2025
1 parent 4e7ee16 commit caefce5
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
9 changes: 8 additions & 1 deletion tests/inference_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class InferenceSpeed(unittest.TestCase):
MAX_DELTA_FLOOR_PERCENT = 0.25
MAX_POSITIVE_DELTA_CEIL_PERCENT = 0.25

def inference(self, model_path, backend, tokens_per_second):
def inference(self, model_path, backend, tokens_per_second, assert_result=True):
model = GPTQModel.from_quantized(
model_path,
backend=backend,
Expand Down Expand Up @@ -86,6 +86,13 @@ def inference(self, model_path, backend, tokens_per_second):
print(f"New Token Per Second: {avg_tokens_per_second} token/s")
print(f"**************** {backend} Result Info End****************")

# There are differences between the results of the first and second runs of bitblas
# (there is a cache when running bitblas for the second time),
# so only the results of the second run of bitblas are asserted.
# The first run of bitblas only prints relevant information
if not assert_result:
return

diff_pct = (avg_tokens_per_second / tokens_per_second) * 100
negative_pct = 100 * (1 - self.MAX_DELTA_FLOOR_PERCENT)
positive_pct = 100 * (1 + self.MAX_POSITIVE_DELTA_CEIL_PERCENT)
Expand Down
4 changes: 3 additions & 1 deletion tests/test_inference_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ class TestInferenceSpeed(InferenceSpeed):
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V2, 775),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TRITON, 296),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 295),
(InferenceSpeed.BITBLAS_NATIVE_MODEL_ID, BACKEND.BITBLAS, 724), # First time running bitblas
(InferenceSpeed.BITBLAS_NATIVE_MODEL_ID, BACKEND.BITBLAS, 1474), # Second time running bitblas, there is cache
]
)
def test_inference_speed(self, model_path, backend, tokens_per_second):
    """Benchmark one (model, backend) combination against an expected throughput.

    For BITBLAS an extra unasserted pass runs first: per the notes in
    inference_speed.py, the first BITBLAS run is slower than later cached
    runs, so only the second pass is held to the tokens_per_second target.
    """
    common_kwargs = dict(
        model_path=model_path,
        backend=backend,
        tokens_per_second=tokens_per_second,
    )

    # Warm-up pass (BITBLAS only): prints timing info, skips the assertion.
    if backend == BACKEND.BITBLAS:
        self.inference(assert_result=False, **common_kwargs)

    # Measured pass: asserts throughput is within tolerance of the target.
    self.inference(**common_kwargs)

0 comments on commit caefce5

Please sign in to comment.