Use pytest to benchmark the repro function (#1523)
kiya00 authored Dec 9, 2024
1 parent f169db6 commit 21929d8
Showing 5 changed files with 110 additions and 82 deletions.
55 changes: 55 additions & 0 deletions thunder/benchmarks/__init__.py
@@ -10,6 +10,7 @@
from functools import partial
from numbers import Number
from typing import Any
from contextlib import contextmanager

import torch
import torch.multiprocessing as mp
@@ -34,6 +35,8 @@
from thunder.tests import nanogpt_model, hf_bart_self_attn
from thunder.tests.make_tensor import make_tensor, make_tensor_like

MAX_ALLOCATED_MEMORY_KEYWORD = "max_allocated_memory_MB"

# List of all benchmarks
benchmarks: list = []

@@ -3028,3 +3031,55 @@ def fn(self) -> Callable:
# if args.listbenchmarks:
# list_benchmarks(use_classname=False)
# sys.exit(0)


def timer_and_memory_stats(benchmark) -> Callable:
    """
    Make a timer that also records the peak allocated memory.

    pytest-benchmark has the following benchmarking code structure:

        start = timer()
        for _ in loops_range:
            function_to_benchmark(*args, **kwargs)
        end = timer()

    So the information about the peak allocated memory should be recorded
    after the function_to_benchmark call, and we need to reset the peak memory
    stats before the function_to_benchmark call.

    If reset_peak_memory_stats were called inside the function_to_benchmark
    call, the peak memory stats would be reset multiple times and might not
    be accurate.

    Args:
        benchmark: The pytest-benchmark object

    Returns:
        The decorator that records the peak allocated memory
    """

    def deco(old_timer):
        import functools

        @functools.wraps(old_timer)
        def timer():
            ret = old_timer()
            # Max allocated memory is recorded in MB
            benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
            torch.cuda.reset_peak_memory_stats()
            return ret

        return timer

    return deco


@contextmanager
def record_peak_allocated_memory(benchmark):
    old_timer = benchmark._timer
    benchmark._timer = timer_and_memory_stats(benchmark)(benchmark._timer)
    try:
        yield
    finally:
        benchmark._timer = old_timer
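
For context, a minimal usage sketch (not part of this commit) of how record_peak_allocated_memory composes with a pytest-benchmark fixture; the test function, tensor shapes, and CUDA device are illustrative assumptions:

import torch

from thunder.benchmarks import MAX_ALLOCATED_MEMORY_KEYWORD, record_peak_allocated_memory


def test_matmul(benchmark):
    # Hypothetical workload; any CUDA-allocating callable works here.
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")

    # While the context manager is active, each call to the benchmark's
    # internal timer snapshots torch.cuda.max_memory_allocated() and then
    # resets the peak stats, so every timing interval sees a fresh peak.
    with record_peak_allocated_memory(benchmark):
        benchmark(torch.matmul, a, b)

    # The peak is reported in MB under the "max_allocated_memory_MB" key.
    assert MAX_ALLOCATED_MEMORY_KEYWORD in benchmark.extra_info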
57 changes: 1 addition & 56 deletions thunder/benchmarks/targets.py
@@ -38,6 +38,7 @@
    torch_compile_executor,
    torch_executor,
    thunder_transformerengine_executor,
    record_peak_allocated_memory,
)
from thunder.core.interpreter import interpret

@@ -56,7 +57,6 @@
    "phi-2",
]
RUN_ALL_CONFIGS = os.environ.get("THUNDER_BENCH_RUN_ALL_CONFIGS", "0") == "1"
MAX_ALLOCATED_MEMORY_KEYWORD = "max_allocated_memory_MB"


class ComputeType(Enum):
@@ -81,61 +81,6 @@ def is_requires_grad(type: ComputeType):
)


import functools


def timer_and_memory_stats(benchmark) -> float:
    """
    Make a timer that also records the peak allocated memory.

    pytest-benchmark has the following benchmarking code structure:

        start = timer()
        for _ in loops_range:
            function_to_benchmark(*args, **kwargs)
        end = timer()

    So the information about the peak allocated memory should be recorded
    after the function_to_benchmark call and we need to reset the peak memory
    stats before the function_to_benchmark call.

    If reset_peak_memory_stats is called inside the function_to_benchmark call,
    the peak memory stats will be reset multiple times and the peak memory
    stats may not be accurate.

    Args:
        benchmark: The pytest-benchmark object

    Returns:
        The decorator that records the peak allocated memory
    """

    def deco(old_timer):
        @functools.wraps(old_timer)
        def timer():
            ret = old_timer()
            benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
            torch.cuda.reset_peak_memory_stats()
            return ret

        return timer

    return deco


from contextlib import contextmanager


@contextmanager
def record_peak_allocated_memory(benchmark):
    old_timer = benchmark._timer
    benchmark._timer = timer_and_memory_stats(benchmark)(benchmark._timer)
    try:
        yield
    finally:
        benchmark._timer = old_timer


def benchmark_for_compute_type(compute_type: ComputeType, benchmark, fn: Callable, args, kwargs):
    with record_peak_allocated_memory(benchmark):
        match compute_type:
2 changes: 1 addition & 1 deletion thunder/dynamo/compiler_graph_benchmark.py
@@ -90,7 +90,7 @@ def _get_debug_options(self, **debug_options):
        self.post_graph = debug_options.get("post_graph", False)

    def run_bench(self, gm: torch.fx.GraphModule, name: str, *sample_args):
        from thunder.benchmarks.targets import record_peak_allocated_memory, MAX_ALLOCATED_MEMORY_KEYWORD
        from thunder.benchmarks import record_peak_allocated_memory, MAX_ALLOCATED_MEMORY_KEYWORD

        for ex_name, ex in self.executors.items():
            if ex is None:
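
With the import now coming from thunder.benchmarks, the recorded value can also be inspected offline. A hypothetical sketch (not in the commit) of pulling the recorded peak out of a report produced with --benchmark-json=output.json; the file name and surrounding script are assumptions:

import json

with open("output.json") as fp:
    report = json.load(fp)

# pytest-benchmark stores extra_info per benchmark entry in the JSON report.
for bench in report["benchmarks"]:
    print(bench["name"], bench["extra_info"].get("max_allocated_memory_MB"))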
68 changes: 43 additions & 25 deletions thunder/dynamo/utils.py
@@ -671,7 +671,6 @@ def _readable(
        verbose=False,
        include_stride=include_stride,
        include_device=include_device,
        colored=colored,
    )
    module_code = verbose_python_code.src
    module_code = module_code.lstrip("\n")
@@ -770,43 +769,62 @@ def reproducer(
        del split_reason_str
    if use_pytest_benchmark:
        comment_str += f"""# NOTE: This script requires `pytest-benchmark==4.0.0` to be installed.
# To execute the script, run `pytest {graph_name}.py`"""
    import_str = f"from functools import partial\n\nimport torch\nimport thunder\n"
# To execute the script, run `pytest {graph_name}.py --benchmark-timer=torch.utils.benchmark.utils.timer.timer --benchmark-warmup=on`
# To check the peak allocated CUDA memory, use --benchmark-json=json_file_name and look at the "max_allocated_memory_MB" field in the json file"""
    # The packages that are likely to be used by the code generated from the Torch GraphModule
    code_str = "\n".join([v.import_str for v in torch.fx.graph._custom_builtins.values()])
    code_str += f"\nfrom functools import partial\nimport thunder\n"
    if has_cuda_args:
        import_str += "from thunder.transforms.cudagraph import CUDAGraphTransform\n"
        import_str += "from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform\n"
        code_str += "from thunder.transforms.cudagraph import CUDAGraphTransform\n"
        code_str += "from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform\n"
    if use_pytest_benchmark:
        code_str = f"def test_{graph_name}(benchmark):\n{readable}\n"
    else:
        code_str = f"def test_{graph_name}():\n{readable}\n"

    if any(arg is None for arg in args):
        code_str += f"# Warning: The inputs that cannot be inferred are set to None, requiring the user to manually give inputs according to the code\n"
    input_str = f"""inputs = [\n{chr(10).join(arg_like(a) for a in args)}\n"""
    code_str += f"{_addindent(input_str, 4)}\n]\n"
        code_str += f"""import pytest
    if not use_pytest_benchmark:
        code_str += f"compiled = thunder.jit(DynamoModule(), {thunder_options_str})\n"
        code_str += "compiled(*inputs)"
    else:
        code_str += "from thunder.dynamo.compiler_graph_benchmark import ThunderCompilerGraphBenchmarking\n"
        code_str = f"""{code_str}
bench_executors_dict = {{}}
bench_executors_dict["thunder"]=partial(thunder.jit, {thunder_options_str})
bench_executors_dict["torch.compile"]=torch.compile
bench_executors_dict["dynamo_eager"]=partial(torch.compile, backend="eager")
bench_executors_dict["eager"]=None
"""
        if has_cuda_args:
            code_str = f"""{code_str}bench_executors_dict["thunder_cugraph"]=partial(thunder.jit, transform=CUDAGraphTransform())\n"""
            code_str += f"""bench_executors_dict["thunder_cugraph"]=partial(thunder.jit, transform=CUDAGraphTransform())\n"""
        code_str += f"""
backend = ThunderCompilerGraphBenchmarking(benchmark, executors=bench_executors_dict)
compiled = torch.compile(backend=backend)(DynamoModule())
compiled(*inputs)
executors = list(bench_executors_dict.values())
executor_ids = list(bench_executors_dict.keys())
@pytest.mark.parametrize(
    "executor,",
    executors,
    ids=executor_ids,
)"""
        func_str = f"def test_{graph_name}(benchmark, executor):\n{readable}\n"
    else:
        func_str = f"def test_{graph_name}():\n{readable}\n"

    if any(arg is None for arg in args):
        func_str += f"# Warning: The inputs that cannot be inferred are set to None, requiring the user to manually give inputs according to the code\n"
    input_str = f"""inputs = [\n{chr(10).join(arg_like(a) for a in args)}\n"""
    func_str += f"{_addindent(input_str, 4)}\n]\n"

    if not use_pytest_benchmark:
        func_str += f"compiled = thunder.jit(DynamoModule(), {thunder_options_str})\n"
        func_str += "compiled(*inputs)"
    else:
        func_str = f"""{func_str}
mod = DynamoModule()
compiled = mod if executor == None else executor(mod)
"""
        if not has_cuda_args:
            func_str += f"""benchmark(compiled, *inputs)"""
        else:
            func_str += f"""from thunder.benchmarks import record_peak_allocated_memory
with record_peak_allocated_memory(benchmark):
    benchmark(compiled, *inputs)
"""
    print(comment_str, file=f)
    print(import_str, file=f)
    print(_addindent(code_str, 4), file=f)
    print(code_str, file=f)
    print(_addindent(func_str, 4), file=f)

    if not use_pytest_benchmark:
        print(f"\ntest_{graph_name}()", file=f)
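To make the string assembly above concrete, here is roughly what a generated script could look like for a graph named graph0 with use_pytest_benchmark=True and CUDA inputs. This is a sketch, not actual output of reproducer(): the module body, input shape, and thunder.jit options are invented placeholders.

# NOTE: This script requires `pytest-benchmark==4.0.0` to be installed.
# To execute the script, run `pytest graph0.py --benchmark-timer=torch.utils.benchmark.utils.timer.timer --benchmark-warmup=on`
import torch
from functools import partial
import thunder
from thunder.transforms.cudagraph import CUDAGraphTransform
import pytest

bench_executors_dict = {}
bench_executors_dict["thunder"] = partial(thunder.jit)  # real scripts pass the captured thunder options here
bench_executors_dict["torch.compile"] = torch.compile
bench_executors_dict["dynamo_eager"] = partial(torch.compile, backend="eager")
bench_executors_dict["eager"] = None
bench_executors_dict["thunder_cugraph"] = partial(thunder.jit, transform=CUDAGraphTransform())

executors = list(bench_executors_dict.values())
executor_ids = list(bench_executors_dict.keys())


@pytest.mark.parametrize(
    "executor,",
    executors,
    ids=executor_ids,
)
def test_graph0(benchmark, executor):
    class DynamoModule(torch.nn.Module):
        # Placeholder body; the real script inlines the captured FX graph here.
        def forward(self, x):
            return torch.nn.functional.relu(x)

    inputs = [
        torch.randn(2, 2, device="cuda", requires_grad=True),  # placeholder input
    ]

    mod = DynamoModule()
    compiled = mod if executor == None else executor(mod)
    from thunder.benchmarks import record_peak_allocated_memory

    with record_peak_allocated_memory(benchmark):
        benchmark(compiled, *inputs)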
10 changes: 10 additions & 0 deletions thunder/tests/test_dynamo.py
@@ -788,6 +788,11 @@ def find_target_module(model, target_module_name):
    decorators=(pytest.mark.parametrize("use_pytest_benchmark", (True, False), ids=("benchmark", "repro")),),
)
def test_dynamo_reproducer_2graph(executor, device: str, dtype: dtypes.dtype, use_pytest_benchmark, tmp_path):
    if IS_WINDOWS and use_pytest_benchmark:
        pytest.skip(
            "Skipping on Windows because this uses torch.compile (see https://github.com/Lightning-AI/lightning-thunder/issues/1326)"
        )

    from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform
    from thunder import nvfuser_executor
    from thunder.transforms.cudagraph import CUDAGraphTransform
@@ -893,6 +898,11 @@ def forward(self, x):
    decorators=(pytest.mark.parametrize("use_pytest_benchmark", (True, False), ids=("benchmark", "repro")),),
)
def test_dynamo_reproducer_split(executor, device: str, dtype: dtypes.dtype, use_pytest_benchmark, tmp_path):
    if IS_WINDOWS and use_pytest_benchmark:
        pytest.skip(
            "Skipping on Windows because this uses torch.compile (see https://github.com/Lightning-AI/lightning-thunder/issues/1326)"
        )

    x = torch.ones(2, 2, device=device, dtype=dtype, requires_grad=True)

    backend = ThunderCompiler()
