From a49841bcb25268c75369d488cd74e82630a076d0 Mon Sep 17 00:00:00 2001 From: FindHao Date: Tue, 8 Oct 2024 11:42:13 -0700 Subject: [PATCH 1/3] check dyno and dcgm existence before disabling them --- torchbenchmark/util/triton_op.py | 42 +++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/torchbenchmark/util/triton_op.py b/torchbenchmark/util/triton_op.py index 9613c10d4..33e7e5ad3 100644 --- a/torchbenchmark/util/triton_op.py +++ b/torchbenchmark/util/triton_op.py @@ -23,6 +23,7 @@ import tabulate import torch import triton + from torchbenchmark.util.env_check import fresh_triton_cache, set_random_seed from torchbenchmark.util.experiment.metrics import get_peak_memory from torchbenchmark.util.extra_args import apply_decoration_args, parse_decoration_args @@ -1008,6 +1009,7 @@ def nsys_rep(self, input_id: int, fn_name: str) -> str: def ncu_trace( self, input_id: int, fn_name: str, replay: bool = False, profile_ir=False ) -> str: + import shutil import subprocess # collect the ncu trace @@ -1031,6 +1033,7 @@ def ncu_trace( "_ncu_trace_in_task", ] ) + # Disable DCGM disable_dyno_dcgm = [ "sudo", @@ -1045,13 +1048,44 @@ def ncu_trace( "stop", "nvidia-dcgm", ] - if ( - subprocess.run(disable_dyno_dcgm).returncode != 0 - and subprocess.run(disable_dcgm_service).returncode != 0 - ): + + def run_command_if_exists(command): + if shutil.which(command[0]): # Check if the command exists + try: + # Run the command + return subprocess.run(command, check=True).returncode + except (FileNotFoundError, subprocess.CalledProcessError): + return -1 + return -1 + + def service_exists(service_name): + try: + result = subprocess.run( + ["systemctl", "status", service_name], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + ) + return result.returncode == 0 + except subprocess.CalledProcessError: + return False + + # Attempt to disable DCGM using dyno if available + dyno_result = ( + run_command_if_exists(disable_dyno_dcgm) if 
shutil.which("dyno") else -1 + ) + + # Check if the nvidia-dcgm service exists before attempting to stop it + if service_exists("nvidia-dcgm"): + systemctl_result = run_command_if_exists(disable_dcgm_service) + else: + systemctl_result = -1 + + if dyno_result != 0 and systemctl_result != 0: warnings.warn( "DCGM may not have been successfully disabled. Proceeding to collect NCU trace anyway..." ) + ncu_output_dir = self.get_temp_path(f"ncu_traces/{fn_name}_{input_id}") ncu_output_dir.mkdir(parents=True, exist_ok=True) ext = ".csv" if not replay else ".ncu-rep" From 3d17864ff3f1f5f0e9e2e75ebf6876ee742b7a08 Mon Sep 17 00:00:00 2001 From: FindHao Date: Tue, 8 Oct 2024 12:35:48 -0700 Subject: [PATCH 2/3] change how we warn --- torchbenchmark/util/triton_op.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/torchbenchmark/util/triton_op.py b/torchbenchmark/util/triton_op.py index 33e7e5ad3..5cc138b73 100644 --- a/torchbenchmark/util/triton_op.py +++ b/torchbenchmark/util/triton_op.py @@ -1070,22 +1070,13 @@ def service_exists(service_name): except subprocess.CalledProcessError: return False - # Attempt to disable DCGM using dyno if available - dyno_result = ( - run_command_if_exists(disable_dyno_dcgm) if shutil.which("dyno") else -1 - ) - - # Check if the nvidia-dcgm service exists before attempting to stop it - if service_exists("nvidia-dcgm"): + if shutil.which("dyno") or service_exists("nvidia-dcgm"): + dyno_result = run_command_if_exists(disable_dyno_dcgm) systemctl_result = run_command_if_exists(disable_dcgm_service) - else: - systemctl_result = -1 - - if dyno_result != 0 and systemctl_result != 0: - warnings.warn( - "DCGM may not have been successfully disabled. Proceeding to collect NCU trace anyway..." - ) - + if dyno_result != 0 and systemctl_result != 0: + warnings.warn( + "DCGM may not have been successfully disabled. Proceeding to collect NCU trace anyway..." 
+ ) ncu_output_dir = self.get_temp_path(f"ncu_traces/{fn_name}_{input_id}") ncu_output_dir.mkdir(parents=True, exist_ok=True) ext = ".csv" if not replay else ".ncu-rep" From 37f9ac8fee90d586e96cb261976da3ddfcd2133e Mon Sep 17 00:00:00 2001 From: FindHao Date: Tue, 8 Oct 2024 12:38:41 -0700 Subject: [PATCH 3/3] remove extra checks --- torchbenchmark/util/triton_op.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/torchbenchmark/util/triton_op.py b/torchbenchmark/util/triton_op.py index 5cc138b73..f696998f6 100644 --- a/torchbenchmark/util/triton_op.py +++ b/torchbenchmark/util/triton_op.py @@ -1049,15 +1049,6 @@ def ncu_trace( "nvidia-dcgm", ] - def run_command_if_exists(command): - if shutil.which(command[0]): # Check if the command exists - try: - # Run the command - return subprocess.run(command, check=True).returncode - except (FileNotFoundError, subprocess.CalledProcessError): - return -1 - return -1 - def service_exists(service_name): try: result = subprocess.run( @@ -1071,8 +1062,8 @@ def service_exists(service_name): return False if shutil.which("dyno") or service_exists("nvidia-dcgm"): - dyno_result = run_command_if_exists(disable_dyno_dcgm) - systemctl_result = run_command_if_exists(disable_dcgm_service) + dyno_result = subprocess.run(disable_dyno_dcgm).returncode + systemctl_result = subprocess.run(disable_dcgm_service).returncode if dyno_result != 0 and systemctl_result != 0: warnings.warn( "DCGM may not have been successfully disabled. Proceeding to collect NCU trace anyway..."