Inline quantization API wrappers in benchmarks

namgyu-youn · namgyu-youn · commit fbb2f2bc89b7 · 2025-10-03T03:19:17.000+09:00
diff --git a/benchmarks/benchmark_aq.py b/benchmarks/benchmark_aq.py
@@ -18,26 +18,6 @@
 )
 
 
-def _int8wo_api(mod, **kwargs):
-    quantize_(mod, Int8WeightOnlyConfig(**kwargs), set_inductor_config=False)
-
-
-def _int8da_int8w_api(mod, **kwargs):
-    quantize_(
-        mod,
-        Int8DynamicActivationInt8WeightConfig(**kwargs),
-        set_inductor_config=False,
-    )
-
-
-def _int4wo_api(mod, **kwargs):
-    kwargs_copy = kwargs.copy()
-    if "groupsize" in kwargs_copy:
-        kwargs_copy["group_size"] = kwargs_copy["groupsize"]
-        del kwargs_copy["groupsize"]
-    quantize_(mod, Int4WeightOnlyConfig(**kwargs_copy), set_inductor_config=False)
-
-
 class ToyLinearModel(torch.nn.Module):
     """Single linear for m * k * n problem size"""
 
@@ -117,26 +97,14 @@ def _ref_change_linear_weights_to_woqtensors(model, filter_fn=None, **kwargs):
 
 
 @torch.no_grad
-def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
-    if kwargs is None:
-        kwargs = {}
-
+def _bench_quantized_tensor_subclass_perf(api, config, M, N, K):
     m = ToyLinearModel(
         M, N, K, has_bias=True, dtype=torch.bfloat16, device="cuda"
     ).eval()
     m_bf16 = copy.deepcopy(m)
-    m_ref = copy.deepcopy(m)
     example_inputs = m.example_inputs()
 
-    api(m, **kwargs)
-
-    # reference
-    ref_api(m_ref, **kwargs)
-
-    res = m(*example_inputs)
-    ref = m_ref(*example_inputs)
-
-    assert torch.equal(res, ref)
+    api(m, config)  # quantize only m; m_bf16 was copied beforehand and stays the bf16 baseline
 
     # perf comparison
     from torchao.utils import benchmark_model
@@ -146,22 +114,17 @@ def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
     RUNS = 100
 
     torch._dynamo.reset()
-    m_ref = torch.compile(m_ref, mode="max-autotune", fullgraph=True)
-    benchmark_model(m_ref, WARMUP, example_inputs)
-    ref_elapsed_time = benchmark_model(m_ref, RUNS, example_inputs)
+    m_bf16 = torch.compile(m_bf16, mode="max-autotune", fullgraph=True)
+    benchmark_model(m_bf16, WARMUP, example_inputs)
+    bf16_elapsed_time = benchmark_model(m_bf16, RUNS, example_inputs)
 
     torch._dynamo.reset()
     m = torch.compile(m, mode="max-autotune", fullgraph=True)
     benchmark_model(m, WARMUP, example_inputs)
     elapsed_time = benchmark_model(m, RUNS, example_inputs)
 
-    torch._dynamo.reset()
-    m_bf16 = torch.compile(m_bf16, mode="max-autotune", fullgraph=True)
-    benchmark_model(m_bf16, WARMUP, example_inputs)
-    bf16_elapsed_time = benchmark_model(m_bf16, RUNS, example_inputs)
-
     print(
-        f"{(M, N, K)}: elapsed time: {elapsed_time}, ref elapsed time: {ref_elapsed_time}, bf16 elapsed time: {bf16_elapsed_time}"
+        f"{(M, N, K)}: elapsed time: {elapsed_time}, bf16 elapsed time: {bf16_elapsed_time}"
     )
 
 
@@ -170,20 +133,32 @@ def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
         (20, 2048, 2048),
     ]
 
-    print("_int8da_int8w_api")
-
+    print("Int8DynamicActivationInt8WeightConfig")
     for M, N, K in all_shapes:
         _bench_quantized_tensor_subclass_perf(
-            _int8da_int8w_api, _int8da_int8w_api, M, N, K
+            quantize_,
+            Int8DynamicActivationInt8WeightConfig(),
+            M,
+            N,
+            K,
         )
 
-    print("_int8wo_api")
-
+    print("Int8WeightOnlyConfig")
     for M, N, K in all_shapes:
-        _bench_quantized_tensor_subclass_perf(_int8wo_api, _int8wo_api, M, N, K)
-
-    print("_int4wo_api")
-    kwargs = {"groupsize": 32, "version": 1}
+        _bench_quantized_tensor_subclass_perf(
+            quantize_,
+            Int8WeightOnlyConfig(),
+            M,
+            N,
+            K,
+        )
 
+    print("Int4WeightOnlyConfig")
     for M, N, K in all_shapes:
-        _bench_quantized_tensor_subclass_perf(_int4wo_api, _int4wo_api, M, N, K, kwargs)
+        _bench_quantized_tensor_subclass_perf(
+            quantize_,
+            Int4WeightOnlyConfig(group_size=32),
+            M,
+            N,
+            K,
+        )