"""Extract benchmark metrics from NVIDIA Nsight Compute (NCU) report files.

The benchmark harness (torchbenchmark/util/triton_op.py) imports this module
as ``ncu_analyzer``: it uses the two mapping tables below to build the
``ncu --metrics`` argument and copies the dictionary returned by
``read_ncu_report`` into its extra metrics.
"""

import os
import shutil
import sys
from collections import defaultdict
from typing import List

# A dictionary mapping short metric names to their corresponding NVIDIA Nsight
# Compute (NCU) metric names. Don't directly use the NCU metric names in the
# code, use these short names instead. This mapping can help us manage the
# metrics we use in the benchmark.
short_ncu_metric_name = {
    "inst_executed_ffma_peak": "sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained",
    "inst_executed_dfma_peak": "sm__sass_thread_inst_executed_op_dfma_pred_on.sum.peak_sustained",
    "inst_executed_fadd": "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.per_cycle_elapsed",
    "inst_executed_fmul": "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.per_cycle_elapsed",
    "inst_executed_ffma": "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed",
    "inst_executed_dadd": "smsp__sass_thread_inst_executed_op_dadd_pred_on.sum.per_cycle_elapsed",
    "inst_executed_dmul": "smsp__sass_thread_inst_executed_op_dmul_pred_on.sum.per_cycle_elapsed",
    "inst_executed_dfma": "smsp__sass_thread_inst_executed_op_dfma_pred_on.sum.per_cycle_elapsed",
    "dram_bytes_write": "dram__bytes_write.sum",
    "dram_bytes_read": "dram__bytes_read.sum",
    # Same NCU metric as "dram_bandwidth" below; kept as an alias.
    "dram_bytes_per_second": "dram__bytes.sum.per_second",
    "sm_freq": "smsp__cycles_elapsed.avg.per_second",
    "dram_bandwidth": "dram__bytes.sum.per_second",
    "duration": "gpu__time_duration.sum",
}

# A dictionary mapping benchmark metric names to their corresponding short NCU
# metric names.
# NOTE: the two "*_peak" entries are requested for "arithmetic_intensity" but
# are not read by get_arithmetic_intensity() in this module.
bench_metric_to_short_ncu_metric = {
    "memory_traffic": ["dram_bytes_write", "dram_bytes_read"],
    "arithmetic_intensity": [
        "inst_executed_ffma_peak",
        "inst_executed_dfma_peak",
        "inst_executed_fadd",
        "inst_executed_fmul",
        "inst_executed_ffma",
        "inst_executed_dadd",
        "inst_executed_dmul",
        "inst_executed_dfma",
        "dram_bytes_write",
        "dram_bytes_read",
        "sm_freq",
        "dram_bandwidth",
        "duration",
    ],
}


def import_ncu_python_path():
    """
    Make the NVIDIA Nsight Compute (NCU) Python modules importable.

    Locates the 'ncu' command in the system PATH and appends the
    'extras/python' directory next to it to ``sys.path``. The directory is
    appended at most once, so calling this function repeatedly does not grow
    ``sys.path``.

    Raises:
        FileNotFoundError: If the 'ncu' command is not found in the system PATH.
        FileNotFoundError: If the 'extras/python' directory does not exist in the determined NCU path.
    """
    ncu_path = shutil.which("ncu")
    if not ncu_path:
        raise FileNotFoundError("Could not find 'ncu' command in PATH.")
    ncu_path = os.path.dirname(ncu_path)
    ncu_python_path = os.path.join(ncu_path, "extras/python")
    if not os.path.exists(ncu_python_path):
        raise FileNotFoundError(
            f"'extras/python' does not exist in the provided ncu_path: {ncu_path}"
        )
    # This runs once per report read; avoid piling up duplicate entries.
    if ncu_python_path not in sys.path:
        sys.path.append(ncu_python_path)


def _metric_value(kernel, short_name):
    """Return the value of the NCU metric registered under ``short_name``."""
    return kernel.metric_by_name(short_ncu_metric_name[short_name]).value()


def _flops_per_byte(flops, dram_bandwidth):
    """Divide ``flops`` by ``dram_bandwidth`` without raising on zero bandwidth.

    With zero bandwidth the ratio is reported as 0.0 when there is no
    floating-point work either, and as ``inf`` otherwise.
    """
    if dram_bandwidth:
        return flops / dram_bandwidth
    return float("inf") if flops else 0.0


def get_mem_traffic(kernel):
    """Return the kernel's DRAM traffic as a ``(bytes_read, bytes_written)`` tuple."""
    return (
        _metric_value(kernel, "dram_bytes_read"),
        _metric_value(kernel, "dram_bytes_write"),
    )


def get_duration(kernel):
    """Return the kernel's ``gpu__time_duration.sum`` metric value."""
    return _metric_value(kernel, "duration")


# Reference: ncu_install_path/sections/SpeedOfLight_Roofline.py
# and ncu_install_path/sections/SpeedOfLight_RooflineChart.section
def get_arithmetic_intensity(kernel):
    """Return ``(fp32_arithmetic_intensity, fp64_arithmetic_intensity)`` for a kernel.

    For each precision, the per-cycle add, mul and fma instruction rates are
    combined (an fma counts as two operations), scaled by the SM frequency and
    divided by the DRAM bandwidth metric.

    A kernel that reports zero DRAM bandwidth no longer raises
    ``ZeroDivisionError``: its intensity is 0.0 when it also executed no
    floating-point instructions of that precision, and ``inf`` otherwise.
    """
    fp32_achieved = (
        _metric_value(kernel, "inst_executed_fadd")
        + _metric_value(kernel, "inst_executed_fmul")
        + 2 * _metric_value(kernel, "inst_executed_ffma")
    )
    fp64_achieved = (
        _metric_value(kernel, "inst_executed_dadd")
        + _metric_value(kernel, "inst_executed_dmul")
        + 2 * _metric_value(kernel, "inst_executed_dfma")
    )
    sm_freq = _metric_value(kernel, "sm_freq")
    dram_bandwidth = _metric_value(kernel, "dram_bandwidth")
    return (
        _flops_per_byte(fp32_achieved * sm_freq, dram_bandwidth),
        _flops_per_byte(fp64_achieved * sm_freq, dram_bandwidth),
    )


def read_ncu_report(report_path: str, required_metrics: List[str]):
    """Load an NCU report and collect per-kernel and aggregated metrics.

    Only the first range of the report is read; every action in it is treated
    as one kernel.

    Args:
        report_path: Path to the NCU report file.
        required_metrics: Benchmark metric names requested by the caller. Only
            "memory_traffic" and "arithmetic_intensity" are looked at here.

    Returns:
        A ``defaultdict(list)`` with these keys:
          - "durations": per-kernel durations.
          - "memory_traffic": per-kernel ``(read, write)`` tuples (empty when
            "memory_traffic" was not requested).
          - "arithmetic_intensity": per-kernel ``(fp32, fp64)`` tuples (only
            present when "arithmetic_intensity" was requested).
          - "memory_traffic_read_sum" / "memory_traffic_write_sum": totals of
            the per-kernel traffic.
          - "weighted_fp32_arithmetic_intensity" /
            "weighted_fp64_arithmetic_intensity": duration-weighted averages of
            the per-kernel intensities; 0.0 when the total duration is zero.

    Raises:
        AssertionError: If the report does not exist or holds no profile data.
        FileNotFoundError: If the NCU Python modules cannot be located.
    """
    assert os.path.exists(
        report_path
    ), f"The NCU report at {report_path} does not exist. Ensure you add --metrics ncu_rep to your benchmark run."
    import_ncu_python_path()
    # Only importable after import_ncu_python_path() has extended sys.path.
    import ncu_report

    # save all kernels' metrics. {metric_name: [kernel1_metric_value, kernel2_metric_value, ...]}
    results = defaultdict(list)
    test_report = ncu_report.load_report(report_path)
    assert (
        test_report.num_ranges() > 0
    ), f"No profile data found in the NCU report at {report_path}"
    default_range = test_report.range_by_idx(0)
    assert (
        default_range.num_actions() > 0
    ), f"No profile data found in the default range of the NCU report at {report_path}"

    want_memory_traffic = "memory_traffic" in required_metrics
    want_arithmetic_intensity = "arithmetic_intensity" in required_metrics
    total_duration = 0
    weighted_fp32_ai_sum = 0
    weighted_fp64_ai_sum = 0
    for i in range(default_range.num_actions()):
        kernel = default_range.action_by_idx(i)
        duration = get_duration(kernel)
        if want_memory_traffic:
            results["memory_traffic"].append(get_mem_traffic(kernel))
        if want_arithmetic_intensity:
            fp32_ai, fp64_ai = get_arithmetic_intensity(kernel)
            # A zero-duration kernel carries no weight; skipping it also keeps
            # an infinite intensity from turning the sum into NaN (inf * 0).
            if duration:
                weighted_fp32_ai_sum += fp32_ai * duration
                weighted_fp64_ai_sum += fp64_ai * duration
            results["arithmetic_intensity"].append((fp32_ai, fp64_ai))
        results["durations"].append(duration)
        total_duration += duration

    # Reading the key creates an empty "memory_traffic" list when the metric
    # was not requested, so the key is always present in the result.
    memory_traffic = results["memory_traffic"]
    results["memory_traffic_read_sum"] = sum(item[0] for item in memory_traffic)
    results["memory_traffic_write_sum"] = sum(item[1] for item in memory_traffic)
    # Guard against a report whose kernels all have zero duration.
    results["weighted_fp32_arithmetic_intensity"] = (
        weighted_fp32_ai_sum / total_duration if total_duration else 0.0
    )
    results["weighted_fp64_arithmetic_intensity"] = (
        weighted_fp64_ai_sum / total_duration if total_duration else 0.0
    )
    return results
torchbenchmark.util.extra_args import apply_decoration_args, parse_decoration_args @@ -865,8 +866,30 @@ def _init_extra_metrics() -> Dict[str, Any]: metrics.compile_time = self.compile_time(input_id, fn_name, metrics) if "ncu_trace" in self.required_metrics: metrics.ncu_trace = self.ncu_trace(input_id, fn_name) + # Collect NCU metrics if any required metrics match the ncu analyzer + # metrics. Only profile with the necessary metrics to avoid excessive + # overhead. + ncu_metrics = [ + ncu_analyzer.short_ncu_metric_name[short_ncu_metric] + for bench_metric, short_ncu_metrics in ncu_analyzer.bench_metric_to_short_ncu_metric.items() + if bench_metric in self.required_metrics + for short_ncu_metric in short_ncu_metrics + ] if "ncu_rep" in self.required_metrics: - metrics.ncu_rep = self.ncu_trace(input_id, fn_name, replay=True) + if ncu_metrics: + extend_ncu_args = ["--metrics", ",".join(ncu_metrics)] + else: + extend_ncu_args = None + metrics.ncu_rep = self.ncu_trace( + input_id, fn_name, replay=True, extend_ncu_args=extend_ncu_args + ) + # Read and update NCU metrics if any required metrics match the NCU metrics + if ncu_metrics: + ncu_analyzer_results = ncu_analyzer.read_ncu_report( + metrics.ncu_rep, self.required_metrics + ) + for metric_name, metric_value in ncu_analyzer_results.items(): + metrics.extra_metrics[metric_name] = metric_value if "ncu_rep_ir" in self.required_metrics: metrics.ncu_rep_ir = self.ncu_trace( input_id, fn_name, replay=True, profile_ir=True @@ -1007,7 +1030,12 @@ def nsys_rep(self, input_id: int, fn_name: str) -> str: return str(nsys_output_file.resolve()) def ncu_trace( - self, input_id: int, fn_name: str, replay: bool = False, profile_ir=False + self, + input_id: int, + fn_name: str, + replay: bool = False, + profile_ir=False, + extend_ncu_args: List[str] = None, ) -> str: import shutil import subprocess @@ -1015,6 +1043,10 @@ def ncu_trace( # collect the ncu trace import sys + extend_ncu_args = extend_ncu_args or [ + "--set", + "full", + 
] op_task_args = [] if IS_FBCODE else [sys.executable] op_task_args.extend(copy.deepcopy(sys.argv)) for override_option in ["--only", "--input-id", "--num-inputs", "--metrics"]: @@ -1076,8 +1108,6 @@ def service_exists(service_name): ).resolve() ncu_args = [ "ncu", - "--set", - "full", "--nvtx", "--nvtx-include", f"{_RANGE_NAME}/", @@ -1086,6 +1116,7 @@ def service_exists(service_name): "--import-source", "yes", ] + ncu_args.extend(extend_ncu_args) if replay: ncu_args.extend( [