Skip to content

Commit

Permalink
code review
Browse files Browse the repository at this point in the history
  • Loading branch information
CL-ModelCloud committed Jan 24, 2025
1 parent 4e7ee16 commit caefce5
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
9 changes: 8 additions & 1 deletion tests/inference_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class InferenceSpeed(unittest.TestCase):
MAX_DELTA_FLOOR_PERCENT = 0.25
MAX_POSITIVE_DELTA_CEIL_PERCENT = 0.25

def inference(self, model_path, backend, tokens_per_second):
def inference(self, model_path, backend, tokens_per_second, assert_result=True):
model = GPTQModel.from_quantized(
model_path,
backend=backend,
Expand Down Expand Up @@ -86,6 +86,13 @@ def inference(self, model_path, backend, tokens_per_second):
print(f"New Token Per Second: {avg_tokens_per_second} token/s")
print(f"**************** {backend} Result Info End****************")

# There are differences between the results of the first and second runs of bitblas
# (there is a cache when running bitblas for the second time),
# so only the results of the second run of bitblas are asserted.
# The first run of bitblas only prints relevant information
if not assert_result:
return

diff_pct = (avg_tokens_per_second / tokens_per_second) * 100
negative_pct = 100 * (1 - self.MAX_DELTA_FLOOR_PERCENT)
positive_pct = 100 * (1 + self.MAX_POSITIVE_DELTA_CEIL_PERCENT)
Expand Down
4 changes: 3 additions & 1 deletion tests/test_inference_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ class TestInferenceSpeed(InferenceSpeed):
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V2, 775),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TRITON, 296),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 295),
(InferenceSpeed.BITBLAS_NATIVE_MODEL_ID, BACKEND.BITBLAS, 724), # First time running bitblas
(InferenceSpeed.BITBLAS_NATIVE_MODEL_ID, BACKEND.BITBLAS, 1474), # Second time running bitblas, there is cache
]
)
def test_inference_speed(self, model_path, backend, tokens_per_second):
    """Benchmark one (model, backend) combination against an expected throughput.

    For BITBLAS an extra unasserted pass runs first: per the notes in
    inference_speed.py, the first BITBLAS run is slower than later cached
    runs, so only the second pass is held to the tokens_per_second target.
    """
    common_kwargs = dict(
        model_path=model_path,
        backend=backend,
        tokens_per_second=tokens_per_second,
    )

    # Warm-up pass (BITBLAS only): prints timing info, skips the assertion.
    if backend == BACKEND.BITBLAS:
        self.inference(assert_result=False, **common_kwargs)

    # Measured pass: asserts throughput is within tolerance of the target.
    self.inference(**common_kwargs)

0 comments on commit caefce5

Please sign in to comment.