import functools
from collections.abc import Callable
from contextlib import contextmanager

import torch

# Key under which the peak allocated CUDA memory (in MB) is stored in
# ``benchmark.extra_info``.
MAX_ALLOCATED_MEMORY_KEYWORD = "max_allocated_memory_MB"


def timer_and_memory_stats(benchmark) -> Callable:
    """
    Make a timer decorator that also records the peak allocated memory.

    pytest-benchmark has the following benchmarking code structure:

        start = timer()
        for _ in loops_range:
            function_to_benchmark(*args, **kwargs)
        end = timer()

    So the peak allocated memory has to be recorded after the
    function_to_benchmark calls, and the peak memory stats have to be reset
    before them. Doing both inside the timer achieves that: the ``start`` call
    resets the stats and the ``end`` call records the peak reached in between.

    If reset_peak_memory_stats were called inside function_to_benchmark
    instead, the stats would be reset on every loop iteration and the
    recorded peak may not be accurate.

    When CUDA is not available the returned timer simply forwards to the
    original timer: nothing is written to ``benchmark.extra_info`` and no
    ``torch.cuda`` function is called.

    Args:
        benchmark: The pytest-benchmark object. Only its ``extra_info``
            mapping is used here.

    Returns:
        A decorator that takes the original timer callable and returns a
        wrapped timer with the same return value.
    """
    # Checked once, up front: the wrapped timer does its bookkeeping after
    # ``old_timer()`` has already produced the timestamp, so any extra work
    # done per call lands inside the measured interval of the ``start`` call.
    cuda_available = torch.cuda.is_available()

    def deco(old_timer):
        @functools.wraps(old_timer)
        def timer():
            ret = old_timer()
            if cuda_available:
                # Max allocated memory is recorded in MB
                benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = torch.cuda.max_memory_allocated() / (
                    1024 * 1024.0
                )
                torch.cuda.reset_peak_memory_stats()
            return ret

        return timer

    return deco


@contextmanager
def record_peak_allocated_memory(benchmark):
    """
    Temporarily replace ``benchmark._timer`` with a memory-recording timer.

    Inside the ``with`` block, ``benchmark._timer`` is the timer produced by
    ``timer_and_memory_stats``; on exit (including when the body raises) the
    original timer is put back.

    Args:
        benchmark: The pytest-benchmark object whose ``_timer`` attribute is
            swapped and whose ``extra_info`` receives the measurement under
            ``MAX_ALLOCATED_MEMORY_KEYWORD``.
    """
    old_timer = benchmark._timer
    benchmark._timer = timer_and_memory_stats(benchmark)(old_timer)
    try:
        yield
    finally:
        # Always restore, so a failing benchmark does not leave the patched
        # timer installed on the fixture.
        benchmark._timer = old_timer
- - Args: - benchmark: The pytest-benchmark object - - Returns: - The decorator that records the peak allocated memory - """ - - def deco(old_timer): - @functools.wraps(old_timer) - def timer(): - ret = old_timer() - benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = torch.cuda.max_memory_allocated() / (1024 * 1024.0) - torch.cuda.reset_peak_memory_stats() - return ret - - return timer - - return deco - - -from contextlib import contextmanager - - -@contextmanager -def record_peak_allocated_memory(benchmark): - old_timer = benchmark._timer - benchmark._timer = timer_and_memory_stats(benchmark)(benchmark._timer) - try: - yield - finally: - benchmark._timer = old_timer - - def benchmark_for_compute_type(compute_type: ComputeType, benchmark, fn: Callable, args, kwargs): with record_peak_allocated_memory(benchmark): match compute_type: diff --git a/thunder/dynamo/compiler_graph_benchmark.py b/thunder/dynamo/compiler_graph_benchmark.py index 5dc6eecd37..194d62abcf 100644 --- a/thunder/dynamo/compiler_graph_benchmark.py +++ b/thunder/dynamo/compiler_graph_benchmark.py @@ -90,7 +90,7 @@ def _get_debug_options(self, **debug_options): self.post_graph = debug_options.get("post_graph", False) def run_bench(self, gm: torch.fx.GraphModule, name: str, *sample_args): - from thunder.benchmarks.targets import record_peak_allocated_memory, MAX_ALLOCATED_MEMORY_KEYWORD + from thunder.benchmarks import record_peak_allocated_memory, MAX_ALLOCATED_MEMORY_KEYWORD for ex_name, ex in self.executors.items(): if ex is None: diff --git a/thunder/dynamo/utils.py b/thunder/dynamo/utils.py index 7dbcabb7d1..8376cb2c7d 100644 --- a/thunder/dynamo/utils.py +++ b/thunder/dynamo/utils.py @@ -671,7 +671,6 @@ def _readable( verbose=False, include_stride=include_stride, include_device=include_device, - colored=colored, ) module_code = verbose_python_code.src module_code = module_code.lstrip("\n") @@ -770,27 +769,17 @@ def reproducer( del split_reason_str if use_pytest_benchmark: comment_str += f"""# 
NOTE: This script requires `pytest-benchmark==4.0.0` to be installed. -# To execute the script, run `pytest {graph_name}.py`""" - import_str = f"from functools import partial\n\nimport torch\nimport thunder\n" +# To execute the script, run `pytest {graph_name}.py --benchmark-timer=torch.utils.benchmark.utils.timer.timer --benchmark-warmup=on` +# To check the peak allocated CUDA memory, use --benchmark-json=json_file_name and look at the "max_allocated_memory_MB" field in the json file""" + # The packages that are likely to be used by the code generated from the Torch GraphModule + code_str = "\n".join([v.import_str for v in torch.fx.graph._custom_builtins.values()]) + code_str += f"\nfrom functools import partial\nimport thunder\n" if has_cuda_args: - import_str += "from thunder.transforms.cudagraph import CUDAGraphTransform\n" - import_str += "from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform\n" + code_str += "from thunder.transforms.cudagraph import CUDAGraphTransform\n" + code_str += "from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform\n" if use_pytest_benchmark: - code_str = f"def test_{graph_name}(benchmark):\n{readable}\n" - else: - code_str = f"def test_{graph_name}():\n{readable}\n" - - if any(arg is None for arg in args): - code_str += f"# Warning: The inputs that cannot be inferred are set to None, requiring the user to manually give inputs according to the code\n" - input_str = f"""inputs = [\n{chr(10).join(arg_like(a) for a in args)}\n""" - code_str += f"{_addindent(input_str, 4)}\n]\n" + code_str += f"""import pytest - if not use_pytest_benchmark: - code_str += f"compiled = thunder.jit(DynamoModule(), {thunder_options_str})\n" - code_str += "compiled(*inputs)" - else: - code_str += "from thunder.dynamo.compiler_graph_benchmark import ThunderCompilerGraphBenchmarking\n" - code_str = f"""{code_str} bench_executors_dict = {{}} bench_executors_dict["thunder"]=partial(thunder.jit, {thunder_options_str}) 
bench_executors_dict["torch.compile"]=torch.compile @@ -798,15 +787,44 @@ def reproducer( bench_executors_dict["eager"]=None """ if has_cuda_args: - code_str = f"""{code_str}bench_executors_dict["thunder_cugraph"]=partial(thunder.jit, transform=CUDAGraphTransform())\n""" + code_str += f"""bench_executors_dict["thunder_cugraph"]=partial(thunder.jit, transform=CUDAGraphTransform())\n""" code_str += f""" -backend = ThunderCompilerGraphBenchmarking(benchmark, executors=bench_executors_dict) -compiled = torch.compile(backend=backend)(DynamoModule()) -compiled(*inputs) +executors = list(bench_executors_dict.values()) +executor_ids = list(bench_executors_dict.keys()) + +@pytest.mark.parametrize( + "executor,", + executors, + ids=executor_ids, +)""" + func_str = f"def test_{graph_name}(benchmark, executor):\n{readable}\n" + else: + func_str = f"def test_{graph_name}():\n{readable}\n" + + if any(arg is None for arg in args): + func_str += f"# Warning: The inputs that cannot be inferred are set to None, requiring the user to manually give inputs according to the code\n" + input_str = f"""inputs = [\n{chr(10).join(arg_like(a) for a in args)}\n""" + func_str += f"{_addindent(input_str, 4)}\n]\n" + + if not use_pytest_benchmark: + func_str += f"compiled = thunder.jit(DynamoModule(), {thunder_options_str})\n" + func_str += "compiled(*inputs)" + else: + func_str = f"""{func_str} +mod = DynamoModule() +compiled = mod if executor == None else executor(mod) +""" + if not has_cuda_args: + func_str += f"""benchmark(compiled, *inputs)""" + else: + func_str += f"""from thunder.benchmarks import record_peak_allocated_memory + +with record_peak_allocated_memory(benchmark): + benchmark(compiled, *inputs) """ print(comment_str, file=f) - print(import_str, file=f) - print(_addindent(code_str, 4), file=f) + print(code_str, file=f) + print(_addindent(func_str, 4), file=f) if not use_pytest_benchmark: print(f"\ntest_{graph_name}()", file=f) diff --git a/thunder/tests/test_dynamo.py 
b/thunder/tests/test_dynamo.py index 3bc88e8a9f..0ae470b1fc 100644 --- a/thunder/tests/test_dynamo.py +++ b/thunder/tests/test_dynamo.py @@ -788,6 +788,11 @@ def find_target_module(model, target_module_name): decorators=(pytest.mark.parametrize("use_pytest_benchmark", (True, False), ids=("benchmark", "repro")),), ) def test_dynamo_reproducer_2graph(executor, device: str, dtype: dtypes.dtype, use_pytest_benchmark, tmp_path): + if IS_WINDOWS and use_pytest_benchmark: + pytest.skip( + "Skipping on Windows because this uses torch.compile (see https://github.com/Lightning-AI/lightning-thunder/issues/1326)" + ) + from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform from thunder import nvfuser_executor from thunder.transforms.cudagraph import CUDAGraphTransform @@ -893,6 +898,11 @@ def forward(self, x): decorators=(pytest.mark.parametrize("use_pytest_benchmark", (True, False), ids=("benchmark", "repro")),), ) def test_dynamo_reproducer_split(executor, device: str, dtype: dtypes.dtype, use_pytest_benchmark, tmp_path): + if IS_WINDOWS and use_pytest_benchmark: + pytest.skip( + "Skipping on Windows because this uses torch.compile (see https://github.com/Lightning-AI/lightning-thunder/issues/1326)" + ) + x = torch.ones(2, 2, device=device, dtype=dtype, requires_grad=True) backend = ThunderCompiler()