Added support to benchmark_model for cpu and mps #406

Merged
merged 10 commits on Jul 10, 2024
35 changes: 35 additions & 0 deletions test/integration/test_integration.py
@@ -83,6 +83,7 @@
    TORCH_VERSION_AFTER_2_4,
    unwrap_tensor_subclass,
    is_fbcode,
    benchmark_model
)

logger = logging.getLogger("INFO")
@@ -1487,5 +1488,39 @@ def test_get_model_size_aqt(self, api, test_device, test_dtype):



class TestBenchmarkModel(unittest.TestCase):

    class ToyLinearModel(torch.nn.Module):
        def __init__(self, m=64, n=32, k=64):
            super().__init__()
            self.linear1 = torch.nn.Linear(m, n, bias=False)
            self.linear2 = torch.nn.Linear(n, k, bias=False)

        def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
            return (torch.randn(batch_size, self.linear1.in_features, dtype=dtype, device=device),)

        def forward(self, x):
            x = self.linear1(x)
            x = self.linear2(x)
            return x

    def run_benchmark_model(self, device):
        # params
        dtype = torch.bfloat16
        m = self.ToyLinearModel(1024, 1024, 1024).eval().to(dtype).to(device)
        m_bf16 = copy.deepcopy(m)
        example_inputs = m.example_inputs(dtype=dtype, device=device)
        m_bf16 = torch.compile(m_bf16, mode='max-autotune')
        # with num_runs=1 and no warmup, the measured time includes compilation
        num_runs = 1
        return benchmark_model(m_bf16, num_runs, example_inputs[0])

    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    def test_benchmark_model_cuda(self):
        assert self.run_benchmark_model("cuda") is not None

    def test_benchmark_model_cpu(self):
        assert self.run_benchmark_model("cpu") is not None


if __name__ == "__main__":
    unittest.main()
82 changes: 68 additions & 14 deletions torchao/utils.py
@@ -1,10 +1,12 @@
import torch
from typing import Tuple
from typing import Tuple, Any
from functools import reduce
from importlib.metadata import version
from math import gcd
import torch.nn.utils.parametrize as parametrize
import itertools
import time
import warnings

__all__ = [
"benchmark_model",
@@ -22,20 +24,72 @@
]


# Referenced from: https://github.com/pytorch/pytorch/blob/9105d54c6b37099575c0059ef274c86c4dc80c57/torch/ao/quantization/utils.py#L711
def _assert_and_get_unique_device(module: torch.nn.Module) -> Any:
    """
    Returns the unique device for a module, or None if no device is found.
    Throws an error if multiple devices are detected.
    """
    devices = {p.device for p in module.parameters()} | \
        {p.device for p in module.buffers()}

    if {torch.device("cpu"), torch.device("meta")} == devices:
        warnings.warn("Both 'meta' and 'cpu' devices are present; a module should have a single device. Selecting 'cpu'.")
        devices = {torch.device("cpu")}

    assert len(devices) <= 1, (
        "benchmark_model only works with single-device modules, "
        f"but got devices {devices}"
    )
    device = next(iter(devices)) if len(devices) > 0 else None
    return device
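
A minimal sketch of how this helper behaves (illustrative only, not part of the diff; assumes the function above is in scope):

# Single-device module: all parameters live on one device, which is returned.
m = torch.nn.Linear(4, 4)  # parameters default to CPU
assert _assert_and_get_unique_device(m) == torch.device("cpu")

# Parameter-less module: no devices are found, so None is returned.
assert _assert_and_get_unique_device(torch.nn.ReLU()) is None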


def benchmark_model(model, num_runs, input_tensor):
    # assumes the model has at least one parameter or buffer to infer a device from
    device_type = _assert_and_get_unique_device(model).type

    if device_type == "cuda":
        torch.cuda.synchronize()
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()

        # benchmark
        for _ in range(num_runs):
            with torch.autograd.profiler.record_function("timed region"):
                model(input_tensor)

        end_event.record()
        torch.cuda.synchronize()
        # elapsed_time() reports milliseconds
        return start_event.elapsed_time(end_event) / num_runs

    elif device_type == "mps":
        torch.mps.synchronize()
        start_event = torch.mps.event.Event(enable_timing=True)
        end_event = torch.mps.event.Event(enable_timing=True)
        start_event.record()

        # benchmark
        for _ in range(num_runs):
            with torch.autograd.profiler.record_function("timed region"):
                model(input_tensor)

        end_event.record()
        torch.mps.synchronize()
        # elapsed_time() reports milliseconds
        return start_event.elapsed_time(end_event) / num_runs

    elif device_type == "cpu":
        torch.cpu.synchronize()
        start_time = time.time()

        # benchmark
        for _ in range(num_runs):
            with torch.autograd.profiler.record_function("timed region"):
                model(input_tensor)

        end_time = time.time()
        torch.cpu.synchronize()
        # wall-clock time in seconds (note: the CUDA/MPS branches report milliseconds)
        average_time_per_run = (end_time - start_time) / num_runs
        return average_time_per_run
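
For reference, a minimal usage sketch of the updated function (illustrative only; the model and shapes are made up, and it exercises the CPU branch so no accelerator is required):

# Build a small eval-mode model and a matching input.
model = torch.nn.Sequential(
    torch.nn.Linear(1024, 1024, bias=False),
    torch.nn.Linear(1024, 1024, bias=False),
).eval()
x = torch.randn(1, 1024)

# The CPU branch reports seconds per run; the first iteration may be
# slower than steady state since there is no separate warmup phase.
with torch.no_grad():
    avg = benchmark_model(model, 10, x)
print(f"average time per run: {avg:.6f} s")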


def profiler_runner(path, fn, *args, **kwargs):
    with torch.profiler.profile(