Use pytest to benchmark the repro function (#1523)
kiya00 authored Dec 9, 2024
1 parent f169db6 commit 21929d8
Showing 5 changed files with 110 additions and 82 deletions.
55 changes: 55 additions & 0 deletions thunder/benchmarks/__init__.py
@@ -10,6 +10,7 @@
from functools import partial
from numbers import Number
from typing import Any
from contextlib import contextmanager

import torch
import torch.multiprocessing as mp
@@ -34,6 +35,8 @@
from thunder.tests import nanogpt_model, hf_bart_self_attn
from thunder.tests.make_tensor import make_tensor, make_tensor_like

MAX_ALLOCATED_MEMORY_KEYWORD = "max_allocated_memory_MB"

# List of all benchmarks
benchmarks: list = []

@@ -3028,3 +3031,55 @@ def fn(self) -> Callable:
# if args.listbenchmarks:
# list_benchmarks(use_classname=False)
# sys.exit(0)


def timer_and_memory_stats(benchmark) -> Callable:
    """
    Make a timer that also records the peak allocated memory.

    pytest-benchmark has the following benchmarking code structure:

        start = timer()
        for _ in loops_range:
            function_to_benchmark(*args, **kwargs)
        end = timer()

    So the information about the peak allocated memory should be recorded
    after the function_to_benchmark call, and we need to reset the peak memory
    stats before the function_to_benchmark call.

    If reset_peak_memory_stats were called inside the function_to_benchmark
    call, the peak memory stats would be reset multiple times and might not
    be accurate.

    Args:
        benchmark: The pytest-benchmark object

    Returns:
        The decorator that records the peak allocated memory
    """

    def deco(old_timer):
        import functools

        @functools.wraps(old_timer)
        def timer():
            ret = old_timer()
            # Max allocated memory is recorded in MB
            benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
            torch.cuda.reset_peak_memory_stats()
            return ret

        return timer

    return deco


@contextmanager
def record_peak_allocated_memory(benchmark):
    old_timer = benchmark._timer
    benchmark._timer = timer_and_memory_stats(benchmark)(benchmark._timer)
    try:
        yield
    finally:
        benchmark._timer = old_timer
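
For context, a minimal usage sketch (not part of this commit) of how record_peak_allocated_memory composes with a pytest-benchmark fixture; the test function, tensor shapes, and CUDA device are illustrative assumptions:

import torch

from thunder.benchmarks import MAX_ALLOCATED_MEMORY_KEYWORD, record_peak_allocated_memory


def test_matmul(benchmark):
    # Hypothetical workload; any CUDA-allocating callable works here.
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")

    # While the context manager is active, each call to the benchmark's
    # internal timer snapshots torch.cuda.max_memory_allocated() and then
    # resets the peak stats, so every timing interval sees a fresh peak.
    with record_peak_allocated_memory(benchmark):
        benchmark(torch.matmul, a, b)

    # The peak is reported in MB under the "max_allocated_memory_MB" key.
    assert MAX_ALLOCATED_MEMORY_KEYWORD in benchmark.extra_info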
57 changes: 1 addition & 56 deletions thunder/benchmarks/targets.py
@@ -38,6 +38,7 @@
    torch_compile_executor,
    torch_executor,
    thunder_transformerengine_executor,
    record_peak_allocated_memory,
)
from thunder.core.interpreter import interpret

@@ -56,7 +57,6 @@
    "phi-2",
]
RUN_ALL_CONFIGS = os.environ.get("THUNDER_BENCH_RUN_ALL_CONFIGS", "0") == "1"
MAX_ALLOCATED_MEMORY_KEYWORD = "max_allocated_memory_MB"


class ComputeType(Enum):
@@ -81,61 +81,6 @@ def is_requires_grad(type: ComputeType):
)


import functools


def timer_and_memory_stats(benchmark) -> float:
    """
    Make a timer that also records the peak allocated memory.

    pytest-benchmark has the following benchmarking code structure:

        start = timer()
        for _ in loops_range:
            function_to_benchmark(*args, **kwargs)
        end = timer()

    So the information about the peak allocated memory should be recorded
    after the function_to_benchmark call and we need to reset the peak memory
    stats before the function_to_benchmark call.

    If reset_peak_memory_stats is called inside the function_to_benchmark call,
    the peak memory stats will be reset multiple times and the peak memory
    stats may not be accurate.

    Args:
        benchmark: The pytest-benchmark object

    Returns:
        The decorator that records the peak allocated memory
    """

    def deco(old_timer):
        @functools.wraps(old_timer)
        def timer():
            ret = old_timer()
            benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
            torch.cuda.reset_peak_memory_stats()
            return ret

        return timer

    return deco


from contextlib import contextmanager


@contextmanager
def record_peak_allocated_memory(benchmark):
    old_timer = benchmark._timer
    benchmark._timer = timer_and_memory_stats(benchmark)(benchmark._timer)
    try:
        yield
    finally:
        benchmark._timer = old_timer


def benchmark_for_compute_type(compute_type: ComputeType, benchmark, fn: Callable, args, kwargs):
    with record_peak_allocated_memory(benchmark):
        match compute_type:
2 changes: 1 addition & 1 deletion thunder/dynamo/compiler_graph_benchmark.py
@@ -90,7 +90,7 @@ def _get_debug_options(self, **debug_options):
        self.post_graph = debug_options.get("post_graph", False)

    def run_bench(self, gm: torch.fx.GraphModule, name: str, *sample_args):
        from thunder.benchmarks.targets import record_peak_allocated_memory, MAX_ALLOCATED_MEMORY_KEYWORD
        from thunder.benchmarks import record_peak_allocated_memory, MAX_ALLOCATED_MEMORY_KEYWORD

        for ex_name, ex in self.executors.items():
            if ex is None:
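
With the import now coming from thunder.benchmarks, the recorded value can also be inspected offline. A hypothetical sketch (not in the commit) of pulling the recorded peak out of a report produced with --benchmark-json=output.json; the file name and surrounding script are assumptions:

import json

with open("output.json") as fp:
    report = json.load(fp)

# pytest-benchmark stores extra_info per benchmark entry in the JSON report.
for bench in report["benchmarks"]:
    print(bench["name"], bench["extra_info"].get("max_allocated_memory_MB"))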
68 changes: 43 additions & 25 deletions thunder/dynamo/utils.py
@@ -671,7 +671,6 @@ def _readable(
        verbose=False,
        include_stride=include_stride,
        include_device=include_device,
        colored=colored,
    )
    module_code = verbose_python_code.src
    module_code = module_code.lstrip("\n")
@@ -770,43 +769,62 @@ def reproducer(
        del split_reason_str
    if use_pytest_benchmark:
        comment_str += f"""# NOTE: This script requires `pytest-benchmark==4.0.0` to be installed.
# To execute the script, run `pytest {graph_name}.py`"""
    import_str = f"from functools import partial\n\nimport torch\nimport thunder\n"
# To execute the script, run `pytest {graph_name}.py --benchmark-timer=torch.utils.benchmark.utils.timer.timer --benchmark-warmup=on`
# To check the peak allocated CUDA memory, use --benchmark-json=json_file_name and look at the "max_allocated_memory_MB" field in the json file"""
    # The packages that are likely to be used by the code generated from the Torch GraphModule
    code_str = "\n".join([v.import_str for v in torch.fx.graph._custom_builtins.values()])
    code_str += f"\nfrom functools import partial\nimport thunder\n"
    if has_cuda_args:
        import_str += "from thunder.transforms.cudagraph import CUDAGraphTransform\n"
        import_str += "from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform\n"
        code_str += "from thunder.transforms.cudagraph import CUDAGraphTransform\n"
        code_str += "from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform\n"
    if use_pytest_benchmark:
        code_str = f"def test_{graph_name}(benchmark):\n{readable}\n"
    else:
        code_str = f"def test_{graph_name}():\n{readable}\n"

    if any(arg is None for arg in args):
        code_str += f"# Warning: The inputs that cannot be inferred are set to None, requiring the user to manually give inputs according to the code\n"
    input_str = f"""inputs = [\n{chr(10).join(arg_like(a) for a in args)}\n"""
    code_str += f"{_addindent(input_str, 4)}\n]\n"
        code_str += f"""import pytest
    if not use_pytest_benchmark:
        code_str += f"compiled = thunder.jit(DynamoModule(), {thunder_options_str})\n"
        code_str += "compiled(*inputs)"
    else:
        code_str += "from thunder.dynamo.compiler_graph_benchmark import ThunderCompilerGraphBenchmarking\n"
        code_str = f"""{code_str}
bench_executors_dict = {{}}
bench_executors_dict["thunder"]=partial(thunder.jit, {thunder_options_str})
bench_executors_dict["torch.compile"]=torch.compile
bench_executors_dict["dynamo_eager"]=partial(torch.compile, backend="eager")
bench_executors_dict["eager"]=None
"""
        if has_cuda_args:
            code_str = f"""{code_str}bench_executors_dict["thunder_cugraph"]=partial(thunder.jit, transform=CUDAGraphTransform())\n"""
            code_str += f"""bench_executors_dict["thunder_cugraph"]=partial(thunder.jit, transform=CUDAGraphTransform())\n"""
        code_str += f"""
backend = ThunderCompilerGraphBenchmarking(benchmark, executors=bench_executors_dict)
compiled = torch.compile(backend=backend)(DynamoModule())
compiled(*inputs)
executors = list(bench_executors_dict.values())
executor_ids = list(bench_executors_dict.keys())
@pytest.mark.parametrize(
    "executor,",
    executors,
    ids=executor_ids,
)"""
        func_str = f"def test_{graph_name}(benchmark, executor):\n{readable}\n"
    else:
        func_str = f"def test_{graph_name}():\n{readable}\n"

    if any(arg is None for arg in args):
        func_str += f"# Warning: The inputs that cannot be inferred are set to None, requiring the user to manually give inputs according to the code\n"
    input_str = f"""inputs = [\n{chr(10).join(arg_like(a) for a in args)}\n"""
    func_str += f"{_addindent(input_str, 4)}\n]\n"

    if not use_pytest_benchmark:
        func_str += f"compiled = thunder.jit(DynamoModule(), {thunder_options_str})\n"
        func_str += "compiled(*inputs)"
    else:
        func_str = f"""{func_str}
mod = DynamoModule()
compiled = mod if executor == None else executor(mod)
"""
        if not has_cuda_args:
            func_str += f"""benchmark(compiled, *inputs)"""
        else:
            func_str += f"""from thunder.benchmarks import record_peak_allocated_memory
with record_peak_allocated_memory(benchmark):
    benchmark(compiled, *inputs)
"""
    print(comment_str, file=f)
    print(import_str, file=f)
    print(_addindent(code_str, 4), file=f)
    print(code_str, file=f)
    print(_addindent(func_str, 4), file=f)

    if not use_pytest_benchmark:
        print(f"\ntest_{graph_name}()", file=f)
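To make the string assembly above concrete, here is roughly what a generated script could look like for a graph named graph0 with use_pytest_benchmark=True and CUDA inputs. This is a sketch, not actual output of reproducer(): the module body, input shape, and thunder.jit options are invented placeholders.

# NOTE: This script requires `pytest-benchmark==4.0.0` to be installed.
# To execute the script, run `pytest graph0.py --benchmark-timer=torch.utils.benchmark.utils.timer.timer --benchmark-warmup=on`
import torch
from functools import partial
import thunder
from thunder.transforms.cudagraph import CUDAGraphTransform
import pytest

bench_executors_dict = {}
bench_executors_dict["thunder"] = partial(thunder.jit)  # real scripts pass the captured thunder options here
bench_executors_dict["torch.compile"] = torch.compile
bench_executors_dict["dynamo_eager"] = partial(torch.compile, backend="eager")
bench_executors_dict["eager"] = None
bench_executors_dict["thunder_cugraph"] = partial(thunder.jit, transform=CUDAGraphTransform())

executors = list(bench_executors_dict.values())
executor_ids = list(bench_executors_dict.keys())


@pytest.mark.parametrize(
    "executor,",
    executors,
    ids=executor_ids,
)
def test_graph0(benchmark, executor):
    class DynamoModule(torch.nn.Module):
        # Placeholder body; the real script inlines the captured FX graph here.
        def forward(self, x):
            return torch.nn.functional.relu(x)

    inputs = [
        torch.randn(2, 2, device="cuda", requires_grad=True),  # placeholder input
    ]

    mod = DynamoModule()
    compiled = mod if executor == None else executor(mod)
    from thunder.benchmarks import record_peak_allocated_memory

    with record_peak_allocated_memory(benchmark):
        benchmark(compiled, *inputs)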
10 changes: 10 additions & 0 deletions thunder/tests/test_dynamo.py
@@ -788,6 +788,11 @@ def find_target_module(model, target_module_name):
    decorators=(pytest.mark.parametrize("use_pytest_benchmark", (True, False), ids=("benchmark", "repro")),),
)
def test_dynamo_reproducer_2graph(executor, device: str, dtype: dtypes.dtype, use_pytest_benchmark, tmp_path):
    if IS_WINDOWS and use_pytest_benchmark:
        pytest.skip(
            "Skipping on Windows because this uses torch.compile (see https://github.com/Lightning-AI/lightning-thunder/issues/1326)"
        )

    from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform
    from thunder import nvfuser_executor
    from thunder.transforms.cudagraph import CUDAGraphTransform
@@ -893,6 +898,11 @@ def forward(self, x):
    decorators=(pytest.mark.parametrize("use_pytest_benchmark", (True, False), ids=("benchmark", "repro")),),
)
def test_dynamo_reproducer_split(executor, device: str, dtype: dtypes.dtype, use_pytest_benchmark, tmp_path):
    if IS_WINDOWS and use_pytest_benchmark:
        pytest.skip(
            "Skipping on Windows because this uses torch.compile (see https://github.com/Lightning-AI/lightning-thunder/issues/1326)"
        )

    x = torch.ones(2, 2, device=device, dtype=dtype, requires_grad=True)

    backend = ThunderCompiler()
