Added support to benchmark_model for cpu and mps #406

Merged
merged 10 commits on Jul 10, 2024
35 changes: 35 additions & 0 deletions test/integration/test_integration.py
@@ -83,6 +83,7 @@
    TORCH_VERSION_AFTER_2_4,
    unwrap_tensor_subclass,
    is_fbcode,
    benchmark_model
)

logger = logging.getLogger("INFO")
@@ -1487,5 +1488,39 @@ def test_get_model_size_aqt(self, api, test_device, test_dtype):



class TestBenchmarkModel(unittest.TestCase):

    class ToyLinearModel(torch.nn.Module):
        def __init__(self, m=64, n=32, k=64):
            super().__init__()
            self.linear1 = torch.nn.Linear(m, n, bias=False)
            self.linear2 = torch.nn.Linear(n, k, bias=False)

        def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
            return (torch.randn(batch_size, self.linear1.in_features, dtype=dtype, device=device),)

        def forward(self, x):
            x = self.linear1(x)
            x = self.linear2(x)
            return x

    def run_benchmark_model(self, device):
        # params
        dtype = torch.bfloat16
        m = self.ToyLinearModel(1024, 1024, 1024).eval().to(dtype).to(device)
        m_bf16 = copy.deepcopy(m)
        example_inputs = m.example_inputs(dtype=dtype, device=device)
        m_bf16 = torch.compile(m_bf16, mode='max-autotune')
        # with num_runs=1 and no warmup, the measured time includes compilation
        num_runs = 1
        return benchmark_model(m_bf16, num_runs, example_inputs[0])

    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
    def test_benchmark_model_cuda(self):
        assert self.run_benchmark_model("cuda") is not None

    def test_benchmark_model_cpu(self):
        assert self.run_benchmark_model("cpu") is not None


if __name__ == "__main__":
    unittest.main()
82 changes: 68 additions & 14 deletions torchao/utils.py
@@ -1,10 +1,12 @@
import torch
from typing import Tuple
from typing import Tuple, Any
from functools import reduce
from importlib.metadata import version
from math import gcd
import torch.nn.utils.parametrize as parametrize
import itertools
import time
import warnings

__all__ = [
"benchmark_model",
@@ -22,20 +24,72 @@
]


# Referenced from: https://github.com/pytorch/pytorch/blob/9105d54c6b37099575c0059ef274c86c4dc80c57/torch/ao/quantization/utils.py#L711
def _assert_and_get_unique_device(module: torch.nn.Module) -> Any:
    """
    Returns the unique device for a module, or None if no device is found.
    Throws an error if multiple devices are detected.
    """
    devices = {p.device for p in module.parameters()} | \
        {p.device for p in module.buffers()}

    if {torch.device("cpu"), torch.device("meta")} == devices:
        warnings.warn("Both 'meta' and 'cpu' devices are present; a module should have a single device. Selecting 'cpu'.")
        devices = {torch.device("cpu")}

    assert len(devices) <= 1, (
        "benchmark_model only works with single-device modules, "
        f"but got devices {devices}"
    )
    device = next(iter(devices)) if len(devices) > 0 else None
    return device
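
A minimal sketch of how this helper behaves (illustrative only, not part of the diff; assumes the function above is in scope):

# Single-device module: all parameters live on one device, which is returned.
m = torch.nn.Linear(4, 4)  # parameters default to CPU
assert _assert_and_get_unique_device(m) == torch.device("cpu")

# Parameter-less module: no devices are found, so None is returned.
assert _assert_and_get_unique_device(torch.nn.ReLU()) is None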


def benchmark_model(model, num_runs, input_tensor):
    # assumes the model has at least one parameter or buffer to infer a device from
    device_type = _assert_and_get_unique_device(model).type

    if device_type == "cuda":
        torch.cuda.synchronize()
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()

        # benchmark
        for _ in range(num_runs):
            with torch.autograd.profiler.record_function("timed region"):
                model(input_tensor)

        end_event.record()
        torch.cuda.synchronize()
        # elapsed_time() reports milliseconds
        return start_event.elapsed_time(end_event) / num_runs

    elif device_type == "mps":
        torch.mps.synchronize()
        start_event = torch.mps.event.Event(enable_timing=True)
        end_event = torch.mps.event.Event(enable_timing=True)
        start_event.record()

        # benchmark
        for _ in range(num_runs):
            with torch.autograd.profiler.record_function("timed region"):
                model(input_tensor)

        end_event.record()
        torch.mps.synchronize()
        # elapsed_time() reports milliseconds
        return start_event.elapsed_time(end_event) / num_runs

    elif device_type == "cpu":
        torch.cpu.synchronize()
        start_time = time.time()

        # benchmark
        for _ in range(num_runs):
            with torch.autograd.profiler.record_function("timed region"):
                model(input_tensor)

        end_time = time.time()
        torch.cpu.synchronize()
        # wall-clock time in seconds (note: the CUDA/MPS branches report milliseconds)
        average_time_per_run = (end_time - start_time) / num_runs
        return average_time_per_run
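
For reference, a minimal usage sketch of the updated function (illustrative only; the model and shapes are made up, and it exercises the CPU branch so no accelerator is required):

# Build a small eval-mode model and a matching input.
model = torch.nn.Sequential(
    torch.nn.Linear(1024, 1024, bias=False),
    torch.nn.Linear(1024, 1024, bias=False),
).eval()
x = torch.randn(1, 1024)

# The CPU branch reports seconds per run; the first iteration may be
# slower than steady state since there is no separate warmup phase.
with torch.no_grad():
    avg = benchmark_model(model, 10, x)
print(f"average time per run: {avg:.6f} s")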


def profiler_runner(path, fn, *args, **kwargs):
    with torch.profiler.profile(