From 643fa3471b186d0395ca9c4881a623a1e1b8b36a Mon Sep 17 00:00:00 2001
From: Jez Ng
Date: Tue, 9 Jul 2024 21:07:47 -0700
Subject: [PATCH] Add ncu_rep_ir metric for TTGIR source attribution

Summary: Allows us to conveniently generate a TTGIR-source-attributed profile
in addition to the regular Triton source profile.

Reviewed By: xuzhao9

Differential Revision: D59327836

fbshipit-source-id: 1ad07ac6f2e1f66d73c7700cc84f21da0654d4aa
---
 torchbenchmark/util/triton_op.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/torchbenchmark/util/triton_op.py b/torchbenchmark/util/triton_op.py
index fa88e03b8..8139bbff3 100644
--- a/torchbenchmark/util/triton_op.py
+++ b/torchbenchmark/util/triton_op.py
@@ -184,6 +184,8 @@ class BenchmarkOperatorMetrics:
     ncu_trace: Optional[str] = None
     # ncu replay file
     ncu_rep: Optional[str] = None
+    # ncu replay file with TTGIR line numbers
+    ncu_rep_ir: Optional[str] = None
     # kineto trace file
     kineto_trace: Optional[str] = None
     # cpu peak memory
@@ -803,6 +805,8 @@ def _init_extra_metrics() -> Dict[str, Any]:
                 metrics.ncu_trace = self.ncu_trace(input_id, fn_name)
             if "ncu_rep" in self.required_metrics:
                 metrics.ncu_rep = self.ncu_trace(input_id, fn_name, replay=True)
+            if "ncu_rep_ir" in self.required_metrics:
+                metrics.ncu_rep_ir = self.ncu_trace(input_id, fn_name, replay=True, profile_ir=True)
             if "kineto_trace" in self.required_metrics:
                 metrics.kineto_trace = self.kineto_trace(input_id, fn)
             if "best_config" in self.required_metrics:
@@ -866,7 +870,7 @@ def get_peak_mem(
             metrics_gpu_backend="nvml",
         )
 
-    def ncu_trace(self, input_id: int, fn_name: str, replay: bool=False) -> str:
+    def ncu_trace(self, input_id: int, fn_name: str, replay: bool=False, profile_ir: bool=False) -> str:
         # collect the ncu trace
         import sys
         import subprocess
@@ -910,7 +914,7 @@ def ncu_trace(self, input_id: int, fn_name: str, replay: bool=False) -> str:
         ncu_output_dir = self.get_temp_path(f"ncu_traces/{fn_name}_{input_id}")
         ncu_output_dir.mkdir(parents=True, exist_ok=True)
         ext = ".csv" if not replay else ".ncu-rep"
-        ncu_output_file = ncu_output_dir.joinpath(f"ncu_output{ext}").resolve()
+        ncu_output_file = ncu_output_dir.joinpath(f"ncu_output{'_ir' if profile_ir else ''}{ext}").resolve()
         ncu_args = [
             "ncu",
             "--set",
@@ -940,7 +944,10 @@ def ncu_trace(self, input_id: int, fn_name: str, replay: bool=False) -> str:
         logger.info("Running NCU: %s", shlex.join(ncu_args))
         # Sometimes, `ncu --target-processes all` will fail with the message "Failed to connect to process". Setting
         # CUDA_INJECTION64_PATH=none seems to fix this issue.
-        subprocess.check_call(ncu_args, env={**os.environ, "CUDA_INJECTION64_PATH": "none"})
+        env = {**os.environ, "CUDA_INJECTION64_PATH": "none"}
+        if profile_ir:
+            env["USE_TTGIR_LOC"] = "1"
+        subprocess.check_call(ncu_args, env=env)
         return str(ncu_output_file.resolve())
 
     def kineto_trace(self, input_id: int, fn: Callable) -> str:
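
For reference, a minimal standalone sketch of the profiling pattern this patch automates: run Nsight Compute once for a regular Triton/Python-source-attributed replay and once with USE_TTGIR_LOC=1 for a TTGIR-attributed replay. The "--set full" value and the workload script name are illustrative assumptions; the CUDA_INJECTION64_PATH workaround, the --target-processes behavior, and the USE_TTGIR_LOC toggle follow the patched ncu_trace().

    import os
    import shlex
    import subprocess

    def collect_ncu_rep(output_stem: str, profile_ir: bool) -> None:
        # Profile every process the workload spawns and write a replay file
        # named <output_stem>.ncu-rep.
        ncu_args = [
            "ncu",
            "--set", "full",              # assumed metric set; the harness configures its own
            "--target-processes", "all",
            "-o", output_stem,
            "python", "my_triton_workload.py",  # placeholder workload script
        ]
        # Same workaround as the harness for "Failed to connect to process".
        env = {**os.environ, "CUDA_INJECTION64_PATH": "none"}
        if profile_ir:
            # Have Triton record TTGIR line numbers as the kernel source
            # locations, so ncu attributes SASS to TTGIR instead of Python.
            env["USE_TTGIR_LOC"] = "1"
        print("Running:", shlex.join(ncu_args))
        subprocess.check_call(ncu_args, env=env)

    if __name__ == "__main__":
        collect_ncu_rep("ncu_output", profile_ir=False)    # Triton source profile
        collect_ncu_rep("ncu_output_ir", profile_ir=True)  # TTGIR source profile

Within the harness itself, requesting the ncu_rep_ir metric alongside ncu_rep produces ncu_output_ir.ncu-rep next to the regular ncu_output.ncu-rep under ncu_traces/{fn_name}_{input_id}/.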